/*
 * OMX Video encoder
 * Copyright (C) 2011 Martin Storsjo
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#if CONFIG_OMX_RPI
#define OMX_SKIP64BIT
#endif

#include <dlfcn.h>
#include <OMX_Core.h>
#include <OMX_Component.h>
#include <pthread.h>

#include <assert.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/avutil.h"
#include "libavutil/common.h"
#include "libavutil/imgutils.h"
#include "libavutil/log.h"
#include "libavutil/opt.h"
#include "libavcodec/decode.h"

#include "avcodec.h"
#include "h264.h"
#include "internal.h"
//#include <libyuv.h>

/* libyuv-derived helpers.
 *
 * The original chunk re-typedef'd uint8_t/uint16_t/uint32_t by hand, which
 * conflicts with <stdint.h> (already pulled in transitively — int16_t below
 * was used without any local typedef) and is a redefinition error in C99.
 * Use the standard header instead of guessing platform widths. */
#include <stdint.h>

/* 16-byte SIMD vector types used by the YuvConstants tables. */
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
typedef __declspec(align(16)) uint8_t uvec8[16];
typedef __declspec(align(16)) int16_t vec16[8];
#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef uint8_t __attribute__((vector_size(16))) uvec8;
#else
typedef int16_t vec16[8];
typedef uint8_t uvec8[16];
#endif

/*
 * YUV -> RGB conversion constants.  Two layouts exist: a compact one packed
 * for the ARM NEON ld4r/vld4 loads, and a replicated one for the generic C
 * path.  Both are consumed through the LOAD_YUV_CONSTANTS / CALC_RGB16
 * macros below, which produce 10.6 fixed-point RGB (caller shifts >> 6).
 */
#if defined(__aarch64__) || defined(__arm__)
struct YuvConstants {
  uvec8 kUVCoeff;       /* {UB, VR, UG, VG, 0, ...} chroma coefficients */
  vec16 kRGBCoeffBias;  /* {YG, bias_B, bias_G, bias_R, YB, 0, ...} */
};

/* Build a YuvConstants initializer from the six fixed-point coefficients.
 * The biases fold the implicit -128 chroma offset into one add/sub. */
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                             \
  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},                     \
   {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
    0, 0}}

/* Pull the packed coefficients into locals for the C per-pixel path. */
#define LOAD_YUV_CONSTANTS                 \
  int ub = yuvconstants->kUVCoeff[0];      \
  int vr = yuvconstants->kUVCoeff[1];      \
  int ug = yuvconstants->kUVCoeff[2];      \
  int vg = yuvconstants->kUVCoeff[3];      \
  int yg = yuvconstants->kRGBCoeffBias[0]; \
  int bb = yuvconstants->kRGBCoeffBias[1]; \
  int bg = yuvconstants->kRGBCoeffBias[2]; \
  int br = yuvconstants->kRGBCoeffBias[3]

/* Fixed-point RGB from one Y/U/V triple; biases already carry the -128. */
#define CALC_RGB16                         \
  int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
  int b16 = y1 + (u * ub) - bb;            \
  int g16 = y1 + bg - (u * ug + v * vg);   \
  int r16 = y1 + (v * vr) - br

#else
/* Generic layout: every coefficient replicated across a full SIMD lane. */
struct YuvConstants {
  uint8_t kUVToB[32];
  uint8_t kUVToG[32];
  uint8_t kUVToR[32];
  int16_t kYToRgb[16];
  int16_t kYBiasToRgb[16];
};

#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                     \
  {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,          \
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},         \
   {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,  \
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,          \
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},         \
   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}

/* First replicated element of each table is the scalar coefficient. */
#define LOAD_YUV_CONSTANTS           \
       int ub = yuvconstants->kUVToB[0];  \
       int ug = yuvconstants->kUVToG[0];  \
       int vg = yuvconstants->kUVToG[1];  \
       int vr = yuvconstants->kUVToR[1];  \
       int yg = yuvconstants->kYToRgb[0]; \
       int yb = yuvconstants->kYBiasToRgb[0]

/* Here the -128 chroma offset is applied explicitly instead of via biases. */
#define CALC_RGB16                                \
  int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
  int8_t ui = (int8_t)u;                          \
  int8_t vi = (int8_t)v;                          \
  ui -= 0x80;                                     \
  vi -= 0x80;                                     \
  int b16 = y1 + (ui * ub);                       \
  int g16 = y1 - (ui * ug + vi * vg);             \
  int r16 = y1 + (vi * vr)

#endif

/* Subsample-size: round width up when subsampled by (1 << shift). */
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
/* 32-byte alignment so NEON loads/stores on scratch buffers stay aligned. */
#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))

/*
 * "Any width" wrapper generator for a 2-plane (Y + interleaved UV) to RGB
 * row kernel.  Runs the SIMD kernel on the MASK-aligned bulk, then pushes
 * the ragged tail through a zero-padded scratch buffer (Y at +0, UV at
 * +128, output at +256) so the kernel never touches memory past the
 * caller's row.  UVSHIFT: chroma subsampling; SBPP/SBPP2/BPP: bytes per
 * pixel of Y, UV and output respectively.
 */
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)            \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
               const struct YuvConstants* yuvconstants, int width) {          \
    int r, n;                                                                 \
    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                      \
    memset(temp, 0, 128 * 2); /* for msan */                                  \
    r = width & MASK;                                                         \
    n = width & ~MASK;                                                        \
    if (n > 0) {                                                              \
      ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                      \
    }                                                                         \
    memcpy(temp, y_buf + n * SBPP, r * SBPP);                                 \
    memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
           SS(r, UVSHIFT) * SBPP2);                                           \
    ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1);           \
    memcpy(dst_ptr + n * BPP, temp + 256, r * BPP);                           \
  }

static void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
static void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int width, int depth);
/* The plain-C split kernels are defined unconditionally below, so their
 * prototypes must not hide behind a NEON guard. */
static void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width);
static void SplitUVRow_16_C(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width);

/*
 * NOTE: HAS_ARM_NEON is only #define'd further down, next to the NEON
 * implementations, so guarding these forward declarations with
 * "#ifdef HAS_ARM_NEON" left them unconditionally invisible.  Test the
 * architecture macros directly instead.
 */
#if defined(__aarch64__) || defined(__ARM_NEON__)
static void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
static void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
static void CopyRow_16_NEON(const uint16_t* src, uint16_t* dst, int width, int depth);
static void CopyRow_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int width, int depth);
static void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width);
static void SplitUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width);
static void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width);
static void SplitUVRow_16_Any_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width);
#endif

static void CopyPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height);
static void CopyPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int width, int height, int depth);

void NV12ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf,
    const struct YuvConstants* yuvconstants, int width);

static void SplitUVPlane(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_u, int dst_stride_u,
    uint8_t* dst_v, int dst_stride_v, int width, int height);
static void SplitUVPlane_16(const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_u, int dst_stride_u,
    uint16_t* dst_v, int dst_stride_v, int width, int height, int depth);

static int NV12ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height);
static int NV16ToI422(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height);
static int NV20LETo422P10LE(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height);
static int P010LETo420P10LE(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height);

void NV12ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants, int width);
void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr,
    const struct YuvConstants* yuvconstants, int width);
int NV12ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants,
    int width, int height);
int NV12ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_argb, int dst_stride_argb, int width, int height);

/* Clamp a value to the non-negative range. */
static __inline int32_t clamp0(int32_t v) {
       if (v < 0)
              return 0;
       return v;
}
/* Clamp a value so it does not exceed 255. */
static __inline int32_t clamp255(int32_t v) {
       if (v > 255)
              return 255;
       return v;
}
/* Saturate a signed value into the unsigned byte range [0, 255]. */
static __inline uint32_t Clamp(int32_t val) {
       return (uint32_t)clamp255(clamp0(val));
}

#if defined(__aarch64__) 
/* AArch64 NEON implementations available. */
#define HAS_ARM_NEON
/* Every register the READNV12/YUVTORGB/RGBTORGB8 fragments may clobber. */
#define YUVTORGB_REGS                                                          \
  "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
      "v26", "v27", "v28", "v29", "v30", "v31"

/* Broadcast-load the UV coefficients (v28-v31) and RGB bias terms (v24-v27). */
#define YUVTORGB_SETUP                                                \
  "ld4r       {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
  "ld4r       {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"

/* Read 8 Y and 4 UV pairs; v2 holds the (pre-loaded) NV12/NV21 shuffle
 * table that duplicates each chroma sample for two luma samples. */
#define READNV12                                 \
  "ldr        d0, [%[src_y]], #8             \n" \
  "ldr        d1, [%[src_uv]], #8            \n" \
  "zip1       v0.16b, v0.16b, v0.16b         \n" \
  "prfm       pldl1keep, [%[src_y], 448]     \n" \
  "tbl        v1.16b, {v1.16b}, v2.16b       \n" \
  "prfm       pldl1keep, [%[src_uv], 448]    \n"

/* 8-pixel YUV -> 10.6 fixed-point RGB in v16/v17/v18 (B/G/R). */
#define YUVTORGB                                          \
  "umull2     v3.4s, v0.8h, v24.8h           \n"          \
  "umull      v6.8h, v1.8b, v30.8b           \n"          \
  "umull      v0.4s, v0.4h, v24.4h           \n"          \
  "umlal2     v6.8h, v1.16b, v31.16b         \n" /* DG */ \
   "uqshrn     v0.4h, v0.4s, #16              \n"          \
   "uqshrn2    v0.8h, v3.4s, #16              \n" /* Y */  \
   "umull      v4.8h, v1.8b, v28.8b           \n" /* DB */ \
   "umull2     v5.8h, v1.16b, v29.16b         \n" /* DR */ \
   "add        v17.8h, v0.8h, v26.8h          \n" /* G */  \
   "add        v16.8h, v0.8h, v4.8h           \n" /* B */  \
   "add        v18.8h, v0.8h, v5.8h           \n" /* R */  \
   "uqsub      v17.8h, v17.8h, v6.8h          \n" /* G */  \
   "uqsub      v16.8h, v16.8h, v25.8h         \n" /* B */  \
   "uqsub      v18.8h, v18.8h, v27.8h         \n" /* R */

/* Saturating narrow from 10.6 fixed point to 8-bit RGB. */
#define RGBTORGB8                                \
  "uqshrn     v17.8b, v17.8h, #6             \n" \
  "uqshrn     v16.8b, v16.8h, #6             \n" \
  "uqshrn     v18.8b, v18.8h, #6             \n"

/* tbl shuffle masks: duplicate each U/V byte for a pair of luma samples. */
static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
                                 1, 1, 3, 3, 5, 5, 7, 7};
static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, //U plane(vu order)
                                  0, 0, 2, 2, 4, 4, 6, 6}; //V plane

/* Copy |width| bytes, 32 per iteration; width must be a multiple of 32
 * (ragged widths go through CopyRow_Any_NEON). */
static void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
    __asm__ volatile(
        "1:                                        \n"
        "ldp         q0, q1, [%0], #32             \n"
        "prfm        pldl1keep, [%0, 448]          \n"
        "subs        %w2, %w2, #32                 \n"  // 32 processed per loop
        "stp         q0, q1, [%1], #32             \n"
        "b.gt        1b                            \n"
        : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2  // Output registers
        :                             // Input registers
        : "cc", "memory", "v0", "v1"  // Clobber List
        );
}

/* Copy |width| 16-bit samples, shifting each from 16-bit MSB alignment down
 * to |depth| LSBs.  ushl with a negative per-lane count is a right shift.
 * width must be a multiple of 16. */
static void CopyRow_16_NEON(const uint16_t* src, uint16_t* dst, int width, int depth) {
    int shift = depth - 16;  // Negative for right shift.
    __asm__ volatile(
        "dup         v2.8h, %w3                    \n"
        "1:                                        \n"
        "ld1         {v0.8h, v1.8h}, [%0], #32     \n"  // load 16 Y
        "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
        "ushl        v0.8h, v0.8h, v2.8h           \n"
        "prfm        pldl1keep, [%0, 448]          \n"
        "ushl        v1.8h, v1.8h, v2.8h           \n"
        "st1         {v0.8h, v1.8h}, [%1], #32     \n"  // store 16 Y pixels
        "b.gt        1b                            \n"
        : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
        : "r"(shift)                // %3  // Output registers
        : "cc", "memory", "v0", "v1", "v2"  // Clobber List
        );
}

/* De-interleave |width| UV pairs into separate U and V rows, 16 pairs per
 * iteration; width must be a multiple of 16. */
static void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) {
    __asm__ volatile(
        "1:                                        \n"
        "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
        "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
        "prfm        pldl1keep, [%0, 448]          \n"
        "st1         {v0.16b}, [%1], #16           \n"  // store U
        "st1         {v1.16b}, [%2], #16           \n"  // store V
        "b.gt        1b                            \n"
        : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
        :                             // Input registers
        : "cc", "memory", "v0", "v1"  // Clobber List
        );
}

/* De-interleave |width| 16-bit UV pairs, shifting each sample from 16-bit
 * MSB alignment down to |depth| LSBs (ushl with a negative count is a right
 * shift).  width must be a multiple of 8. */
static void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v,
    int depth, int width) {
    int shift = depth - 16;  // Negative for right shift.
    __asm__ volatile(
        "dup         v2.8h, %w4                    \n"
        "1:                                        \n"
        "ld2         {v0.8h, v1.8h}, [%0], #32     \n"  // load 8 UV
        "subs        %w3, %w3, #8                  \n"  // 8 src pixels per loop
        "ushl        v0.8h, v0.8h, v2.8h           \n"
        "prfm        pldl1keep, [%0, 448]          \n"
        "ushl        v1.8h, v1.8h, v2.8h           \n"
        "st1         {v0.8h}, [%1], #16            \n"  // store 8 U pixels
        "st1         {v1.8h}, [%2], #16            \n"  // store 8 V pixels
        "b.gt        1b                            \n"
        : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
        : "r"(shift)     // %4
        : "cc", "memory", "v0", "v1", "v2");
}

/* Convert one NV12 row to ARGB, 8 pixels per iteration; width must be a
 * multiple of 8 (ragged widths go through NV12ToARGBRow_Any_NEON).  Alpha
 * is forced to 255; v2 carries the NV12 chroma shuffle table. */
void NV12ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants, int width) {
    __asm__ volatile(
        YUVTORGB_SETUP
        "movi        v19.8b, #255                  \n"
        "ldr         q2, [%[kNV12Table]]           \n"
        "1:                                        \n" READNV12 YUVTORGB RGBTORGB8
        "subs        %w[width], %w[width], #8      \n"
        "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
        "b.gt        1b                            \n"
        : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
        : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
        : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}

#elif defined(__ARM_NEON__)
/* 32-bit ARM NEON implementations available. */
#define HAS_ARM_NEON
/* Registers the READNV12/YUVTORGB/RGBTORGB8 fragments may clobber. */
#define YUVTORGB_REGS \
  "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31"

/* Broadcast-load UV coefficients (d26-d29) and RGB bias terms (d20-d25,
 * d31).
 * NOTE(review): the "!" post-increments advance the register holding the
 * kRGBCoeffBias input operand; GCC inline asm forbids modifying input
 * operands, so this should use a "+r" temp — confirm before reuse. */
#define YUVTORGB_SETUP                                        \
  "vld4.8     {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \
  "vld1.16    {d31[]}, [%[kRGBCoeffBias]]!   \n"              \
  "vld1.16    {d20[], d21[]}, [%[kRGBCoeffBias]]! \n"         \
  "vld1.16    {d22[], d23[]}, [%[kRGBCoeffBias]]! \n"         \
  "vld1.16    {d24[], d25[]}, [%[kRGBCoeffBias]] \n"

/* Read 8 Y and 4 UV pairs from NV12, duplicating each chroma sample for a
 * pair of luma samples. */
#define READNV12                                                              \
  "vld1.8     {d0}, [%[src_y]]!              \n"                              \
  "vld1.8     {d2}, [%[src_uv]]!             \n"                              \
  "vmov.u8    d1, d0                         \n"                              \
  "vmov.u8    d3, d2                         \n"                              \
  "vzip.u8    d0, d1                         \n"                              \
  "vsli.u16   d2, d2, #8                     \n" /* Duplicate low byte (U) */ \
  "vsri.u16   d3, d3, #8                     \n" /* Duplicate high byte (V) */

// Read 8 Y and 4 VU from NV21
#define READNV21                                                               \
  "vld1.8     {d0}, [%[src_y]]!              \n"                               \
  "vld1.8     {d2}, [%[src_vu]]!             \n"                               \
  "vmov.u8    d1, d0                         \n"                               \
  "vmov.u8    d3, d2                         \n"                               \
  "vzip.u8    d0, d1                         \n"                               \
  "vsri.u16   d2, d2, #8                     \n" /* Duplicate high byte (U) */ \
  "vsli.u16   d3, d3, #8                     \n" /* Duplicate low byte (V) */

/* 8-pixel YUV -> 10.6 fixed-point RGB in q0/q1/q2 (B/G/R). */
#define YUVTORGB                                           \
  "vmull.u16  q2, d1, d31                    \n"           \
  "vmull.u8   q8, d3, d29                    \n" /* DGV */ \
  "vmull.u16  q0, d0, d31                    \n"           \
  "vmlal.u8   q8, d2, d28                    \n" /* DG */  \
  "vqshrn.u32 d0, q0, #16                    \n"           \
  "vqshrn.u32 d1, q2, #16                    \n" /* Y */   \
  "vmull.u8   q9, d2, d26                    \n" /* DB */  \
  "vmull.u8   q2, d3, d27                    \n" /* DR */  \
  "vadd.u16   q4, q0, q11                    \n" /* G */   \
  "vadd.u16   q2, q0, q2                     \n" /* R */   \
  "vadd.u16   q0, q0, q9                     \n" /* B */   \
  "vqsub.u16  q1, q4, q8                     \n" /* G */   \
  "vqsub.u16  q0, q0, q10                    \n" /* B */   \
  "vqsub.u16  q2, q2, q12                    \n" /* R */

/* Saturating narrow from 10.6 fixed point to 8-bit RGB in d0/d2/d4. */
#define RGBTORGB8                                        \
  "vqshrn.u16 d4, q2, #6                     \n" /* R */ \
  "vqshrn.u16 d2, q1, #6                     \n" /* G */ \
  "vqshrn.u16 d0, q0, #6                     \n" /* B */

/* Copy |width| bytes, 32 per iteration; width must be a multiple of 32
 * (ragged widths go through CopyRow_Any_NEON). */
static void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
    __asm__ volatile(
        "1:                                        \n"
        "vld1.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 32
        "subs        %2, %2, #32                   \n"  // 32 processed per loop
        "vst1.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 32
        "bgt         1b                            \n"
        : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2  // Output registers
        :                             // Input registers
        : "cc", "memory", "q0", "q1"  // Clobber List
        );
}

/* Copy |width| 16-bit samples, shifting each from 16-bit MSB alignment down
 * to |depth| LSBs (vshl with a negative count is a right shift).  width
 * must be a multiple of 16.
 * Fix: the asm statement had FOUR colon-separated operand sections (a stray
 * empty clobber section before the real one), which plain extended asm does
 * not allow — it is a compile error on every 32-bit ARM build. */
static void CopyRow_16_NEON(const uint16_t* src, uint16_t* dst, int width, int depth) {
    int shift = depth - 16;  // Negative for right shift.
    __asm__ volatile(
        "vdup.16     q2, %3                        \n"
        "1:                                        \n"
        "vld2.16     {q0, q1}, [%0]!               \n"  // load 16 Y
        "vshl.u16    q0, q0, q2                    \n"
        "vshl.u16    q1, q1, q2                    \n"
        "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
        "vst2.16     {q0, q1}, [%1]!               \n"  // store 16 Y pixels
        "bgt         1b                            \n"
        : "+r"(src),                        // %0
        "+r"(dst),                          // %1
        "+r"(width)                         // %2  Output registers
        : "r"(shift)                        // %3  Input registers
        : "cc", "memory", "q0", "q1", "q2"  // Clobber List
        );
}

/* De-interleave |width| UV pairs into separate U and V rows, 16 pairs per
 * iteration; width must be a multiple of 16. */
static void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) {
  __asm__ volatile(
      "1:                                        \n"
      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pairs of UV
      "subs        %3, %3, #16                   \n"  // 16 processed per loop
      "vst1.8      {q0}, [%1]!                   \n"  // store U
      "vst1.8      {q1}, [%2]!                   \n"  // store V
      "bgt         1b                            \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

/* De-interleave |width| 16-bit UV pairs, shifting each sample from 16-bit
 * MSB alignment down to |depth| LSBs (vshl with a negative count is a right
 * shift).  width must be a multiple of 8.
 * Fix: marked static for internal linkage, matching the AArch64 twin and
 * the file-scope prototype. */
static void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v,
    int depth, int width) {
    int shift = depth - 16;  // Negative for right shift.
    __asm__ volatile(
        "vdup.16     q2, %4                        \n"
        "1:                                        \n"
        "vld2.16     {q0, q1}, [%0]!               \n"  // load 8 UV
        "vshl.u16    q0, q0, q2                    \n"
        "vshl.u16    q1, q1, q2                    \n"
        "subs        %3, %3, #8                    \n"  // 8 src pixels per loop
        "vst1.16     {q0}, [%1]!                   \n"  // store 8 U pixels
        "vst1.16     {q1}, [%2]!                   \n"  // store 8 V pixels
        "bgt         1b                            \n"
        : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
        : "r"(shift)     // %4
        : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}

/* Convert one NV12 row to ARGB, 8 pixels per iteration; width must be a
 * multiple of 8 (ragged widths go through NV12ToARGBRow_Any_NEON).  Alpha
 * is forced to 255 in d6. */
void NV12ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants, int width) {
    __asm__ volatile(
        YUVTORGB_SETUP
        "vmov.u8     d6, #255                      \n"
        "1:                                        \n" READNV12 YUVTORGB RGBTORGB8
        "subs        %[width], %[width], #8        \n"
        "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
        "bgt         1b                            \n"
        : [src_y] "+r"(src_y),                               // %[src_y]
        [src_uv] "+r"(src_uv),                             // %[src_uv]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
        : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
        : "cc", "memory", YUVTORGB_REGS, "d6");
}

#endif

#ifdef HAS_ARM_NEON
/* Any-width wrapper for CopyRow_NEON: run the SIMD kernel over the
 * 32-aligned bulk, then push the ragged tail through a zero-padded scratch
 * buffer so the kernel never reads or writes past the caller's row. */
static void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {
    uint8_t scratch[128 * 2] __attribute__((aligned(32)));
    const int tail = width & 31;
    const int bulk = width - tail;
    memset(scratch, 0, 128); /* for YUY2 and msan */
    if (bulk > 0) {
        CopyRow_NEON(src_ptr, dst_ptr, bulk);
    }
    memcpy(scratch, src_ptr + bulk, tail);
    CopyRow_NEON(scratch, scratch + 128, 32);
    memcpy(dst_ptr + bulk, scratch + 128, tail);
}

/* Any-width wrapper for SplitUVRow_NEON: split the 16-aligned bulk with the
 * SIMD kernel, then handle the ragged tail in a zero-padded scratch buffer
 * (UV input at +0, U output at +128, V output at +256). */
static void SplitUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width) {
    uint8_t scratch[128 * 3] __attribute__((aligned(32)));
    const int tail = width & 15;
    const int bulk = width - tail;
    memset(scratch, 0, 128); /* for msan */
    if (bulk > 0) {
        SplitUVRow_NEON(src_ptr, dst_u, dst_v, bulk);
    }
    memcpy(scratch, src_ptr + bulk * 2, tail * 2);
    SplitUVRow_NEON(scratch, scratch + 128, scratch + 256, 16);
    memcpy(dst_u + bulk, scratch + 128, tail);
    memcpy(dst_v + bulk, scratch + 256, tail);
}

/* Any-width wrapper for SplitUVRow_16_NEON: split the 8-aligned bulk with
 * the SIMD kernel, then handle the ragged tail in a zero-padded scratch
 * buffer.  Offsets are in uint16_t elements: UV input at +0, U output at
 * +32, V output at +48. */
static void SplitUVRow_16_Any_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width) { 
    uint16_t temp[16 * 4] __attribute__((aligned(32)));
    int r, n;
    memset(temp, 0, 16 * 4 * 2); /* for msan */
    r = width & 7;
    n = width & ~7;
    if (n > 0) {
        SplitUVRow_16_NEON(src_uv, dst_u, dst_v, depth, n);
    }
    /* r pairs * 2 samples * 2 bytes each. */
    memcpy(temp, src_uv + n * 2, r * 2 * 2);
    SplitUVRow_16_NEON(temp, temp + 32, temp + 48, depth, 8);
    memcpy(dst_u + n, temp + 32, r * 2);
    memcpy(dst_v + n, temp + 48, r * 2);
}

/* Any-width wrapper for CopyRow_16_NEON: copy the 16-aligned bulk with the
 * SIMD kernel, then push the ragged tail through a zero-padded scratch
 * buffer (offsets in uint16_t elements: input at +0, output at +16). */
static void CopyRow_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int width, int depth) {
    uint16_t temp[16 * 2] __attribute__((aligned(32)));                                  
    int r, n;
    memset(temp, 0, 16 * 2 * 2);
    r = width & 15;
    n = width & ~15;
    if (n > 0) {
        CopyRow_16_NEON(src_ptr, dst_ptr, n, depth);
    }
    memcpy(temp, src_ptr + n, r * 2);
    CopyRow_16_NEON(temp, temp + 16, 16, depth);
    memcpy(dst_ptr + n, temp + 16, r * 2);
}

ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif

/* Emit both YUV (kYuv...) and YVU (kYvu..., U/V coefficients swapped)
 * constant tables for one colorspace. */
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR)            \
  const struct YuvConstants kYuv##name##Constants __attribute__((aligned(32)))= \
      YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR);                   \
  const struct YuvConstants kYvu##name##Constants __attribute__((aligned(32)))= \
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);

/* BT.601 limited-range coefficients, 6-bit fixed point. */
#define UB 128 /* max(128, round(2.018 * 64)) */
#define UG 25  /* round(0.391 * 64) */
#define VG 52  /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */

// Y contribution to R,G,B.  Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR

/* True when pointer/size p is a multiple of a (a must be a power of two). */
#define IS_ALIGNED(p, a) (!((unsigned long)(p) & ((a)-1)))

/* Portable row copy: plain memcpy of |count| bytes. */
static void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
    memcpy(dst, src, (size_t)count);
}

/* Convert a single Y/U/V triple to 8-bit B/G/R using the fixed-point
 * coefficients in |yuvconstants|.  The body is assembled from the
 * branch-dependent LOAD_YUV_CONSTANTS / CALC_RGB16 macros above; the
 * result is in 10.6 fixed point, hence the >> 6 before clamping. */
static __inline void YuvPixel(uint8_t y,
       uint8_t u,
       uint8_t v,
       uint8_t* b,
       uint8_t* g,
       uint8_t* r,
       const struct YuvConstants* yuvconstants) {
       LOAD_YUV_CONSTANTS;
       /* Replicate y into both bytes (y * 257-ish) before scaling by yg. */
       uint32_t y32 = y * 0x0101;
       CALC_RGB16;
       *b = Clamp((int32_t)(b16) >> 6);
       *g = Clamp((int32_t)(g16) >> 6);
       *r = Clamp((int32_t)(r16) >> 6);
}

static void CopyPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y,
    int width, int height) {
    int y;
    void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
    if (width <= 0 || height == 0) {
        return;
    }
    // Negative height means invert the image.
    if (height < 0) {
        height = -height;
        dst_y = dst_y + (height - 1) * dst_stride_y;
        dst_stride_y = -dst_stride_y;
    }
    // Coalesce rows.
    if (src_stride_y == width && dst_stride_y == width) {
        width *= height;
        height = 1;
        src_stride_y = dst_stride_y = 0;
    }

    // Nothing to do.
    if (src_y == dst_y && src_stride_y == dst_stride_y) {
        return;
    }

#if defined(HAS_ARM_NEON)
    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
#endif

    // Copy plane
    for (y = 0; y < height; ++y) {
        CopyRow(src_y, dst_y, width);
        src_y += src_stride_y;
        dst_y += dst_stride_y;
    }
}

/*
 * Copy a plane of 16-bit samples, normalizing each from 16-bit MSB
 * alignment down to |depth| LSBs.  Strides are in uint16_t elements (the
 * row pointers are advanced with uint16_t* arithmetic) — TODO(review):
 * confirm all callers pass element strides, matching SplitUVPlane_16.
 * A negative height writes the destination bottom-up.
 */
static void CopyPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y,
    int width, int height, int depth) {
    int y;
    void (*CopyRow_16)(const uint16_t* src, uint16_t* dst, int width, int depth) = CopyRow_16_C;
    if (width <= 0 || height == 0) {
        return;
    }
    // Negative height means invert the image.
    if (height < 0) {
        height = -height;
        dst_y = dst_y + (height - 1) * dst_stride_y;
        dst_stride_y = -dst_stride_y;
    }
    // Coalesce rows.  Strides are in elements, so a packed plane has
    // stride == width; the previous "width * 2" test mixed byte and element
    // units and could wrongly coalesce a plane padded to twice the width,
    // copying the padding as pixel data.
    if (src_stride_y == width && dst_stride_y == width) {
        width *= height;
        height = 1;
        src_stride_y = dst_stride_y = 0;
    }
    // Nothing to do.
    if (src_y == dst_y && src_stride_y == dst_stride_y) {
        return;
    }

#if defined(HAS_ARM_NEON)
    CopyRow_16 = IS_ALIGNED(width, 16) ? CopyRow_16_NEON : CopyRow_16_Any_NEON;
#endif

    // Copy plane
    for (y = 0; y < height; ++y) {
        CopyRow_16(src_y, dst_y, width, depth);
        src_y += src_stride_y;
        dst_y += dst_stride_y;
    }
}

/* Portable UV split: de-interleave |width| UV pairs into U and V rows. */
static void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) {
    int i;
    for (i = 0; i < width; ++i) {
        dst_u[i] = src_uv[2 * i];
        dst_v[i] = src_uv[2 * i + 1];
    }
}

static void SplitUVPlane(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_u, int dst_stride_u,
    uint8_t* dst_v, int dst_stride_v, int width, int height) {
    int y;

    void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
        int width) = SplitUVRow_C;
    if (width <= 0 || height == 0) {
        return;
    }
    // Negative height means invert the image.
    if (height < 0) {
        height = -height;
        dst_u = dst_u + (height - 1) * dst_stride_u;
        dst_v = dst_v + (height - 1) * dst_stride_v;
        dst_stride_u = -dst_stride_u;
        dst_stride_v = -dst_stride_v;
    }
    // Coalesce rows.
    if (src_stride_uv == width * 2 && dst_stride_u == width &&
        dst_stride_v == width) {
        width *= height;
        height = 1;
        src_stride_uv = dst_stride_u = dst_stride_v = 0;
    }

#if defined(HAS_ARM_NEON)
    SplitUVRow = SplitUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
        SplitUVRow = SplitUVRow_NEON;
    }
#endif

    for (y = 0; y < height; ++y) {
        // Copy a row of UV.
        SplitUVRow(src_uv, dst_u, dst_v, width);
        dst_u += dst_stride_u;
        dst_v += dst_stride_v;
        src_uv += src_stride_uv;
    }
}

/* Portable 16-bit row copy: shift each 16-bit MSB-aligned sample down to
 * its |depth| least-significant bits (e.g. P010 -> P10LE uses depth 10,
 * shifting right by 6).
 * Note: relies on <assert.h>, which this file previously never included —
 * only libavutil/avassert.h — so `assert` was an undeclared macro. */
static void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int width, int depth) {
    int shift = 16 - depth;
    int x;
    assert(depth >= 8);
    assert(depth <= 16);
    for (x = 0; x < width; ++x) {
        dst[x] = src[x] >> shift;
    }
}

/* Portable 16-bit UV split: de-interleave |width| UV pairs while shifting
 * each 16-bit MSB-aligned sample down to its |depth| least-significant
 * bits.
 * Note: relies on <assert.h>, which this file previously never included —
 * only libavutil/avassert.h — so `assert` was an undeclared macro. */
static void SplitUVRow_16_C(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width) {
    int shift = 16 - depth;
    int x;
    assert(depth >= 8);
    assert(depth <= 16);
    for (x = 0; x < width; ++x) {
        dst_u[x] = src_uv[0] >> shift;
        dst_v[x] = src_uv[1] >> shift;
        src_uv += 2;
    }
}

/* Split an interleaved 16-bit UV plane into separate U and V planes while
 * narrowing samples to `depth` bits.  Strides are in uint16_t elements
 * (pointer arithmetic below is on uint16_t*).  A negative height flips the
 * destination vertically. */
static void SplitUVPlane_16(const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_u, int dst_stride_u,
    uint16_t* dst_v, int dst_stride_v, int width, int height, int depth) {
    int row;

    void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
        uint16_t* dst_v, int depth, int width) =
        SplitUVRow_16_C;

    if (width <= 0 || height == 0)
        return;

    /* Negative height: write the destination bottom-up. */
    if (height < 0) {
        height = -height;
        dst_u += (height - 1) * dst_stride_u;
        dst_v += (height - 1) * dst_stride_v;
        dst_stride_u = -dst_stride_u;
        dst_stride_v = -dst_stride_v;
    }

    /* Fully contiguous planes: collapse the image into one long row. */
    if (src_stride_uv == width * 2 && dst_stride_u == width &&
        dst_stride_v == width) {
        width *= height;
        height = 1;
        src_stride_uv = dst_stride_u = dst_stride_v = 0;
    }

#if defined(HAS_ARM_NEON)
    SplitUVRow_16 = SplitUVRow_16_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
        SplitUVRow_16 = SplitUVRow_16_NEON;
    }
#endif

    for (row = 0; row < height; ++row) {
        SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
        src_uv += src_stride_uv;
        dst_u += dst_stride_u;
        dst_v += dst_stride_v;
    }
}

/* Convert one row of NV12 (Y plus interleaved UV) to ARGB.
 * Two luma samples share one UV pair, so two pixels are emitted per step;
 * the alpha byte is forced to fully opaque (255). */
void NV12ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf,
    const struct YuvConstants* yuvconstants, int width) {
    int i;

    for (i = 0; i + 1 < width; i += 2) {
        YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
            rgb_buf + 2, yuvconstants);
        rgb_buf[3] = 255;
        YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
            rgb_buf + 6, yuvconstants);
        rgb_buf[7] = 255;
        src_y += 2;
        src_uv += 2;
        rgb_buf += 8;  /* two ARGB pixels written */
    }

    /* Odd width: the last pixel reuses the final UV pair. */
    if (width & 1) {
        YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
            rgb_buf + 2, yuvconstants);
        rgb_buf[3] = 255;
    }
}

/* Convert an NV12 image to ARGB using the supplied YUV->RGB matrix.
 * A negative height flips the destination vertically.
 * Returns 0 on success, -1 on invalid arguments. */
int NV12ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants,
    int width, int height) {
    int row;

    void (*NV12ToARGBRow)(
        const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
        const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;

    assert(yuvconstants);
    if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0)
        return -1;

    /* Negative height: write the destination bottom-up. */
    if (height < 0) {
        height = -height;
        dst_argb += (height - 1) * dst_stride_argb;
        dst_stride_argb = -dst_stride_argb;
    }

#if defined(HAS_ARM_NEON)
    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
        NV12ToARGBRow = NV12ToARGBRow_NEON;
    }
#endif

    for (row = 0; row < height; ++row) {
        NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
        src_y += src_stride_y;
        dst_argb += dst_stride_argb;
        if (row & 1) {
            /* 4:2:0 chroma: each UV row serves two luma rows. */
            src_uv += src_stride_uv;
        }
    }
    return 0;
}

/* Convert NV12 (Y plane + interleaved half-height UV plane) to planar I420.
 * A negative height flips the image; dst_y may be NULL to skip the luma copy.
 * Returns 0 on success, -1 on invalid arguments. */
static int NV12ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height) {
    int halfwidth = (width + 1) >> 1;
    int halfheight = (height + 1) >> 1;

    if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0)
        return -1;

    /* Negative height: read the source bottom-up. */
    if (height < 0) {
        height = -height;
        halfheight = (height + 1) >> 1;
        src_y += (height - 1) * src_stride_y;
        src_uv += (halfheight - 1) * src_stride_uv;
        src_stride_y = -src_stride_y;
        src_stride_uv = -src_stride_uv;
    }

    /* Contiguous luma rows: treat the Y plane as one long row. */
    if (src_stride_y == width && dst_stride_y == width) {
        width *= height;
        height = 1;
        src_stride_y = dst_stride_y = 0;
    }
    /* Contiguous chroma rows: same trick for the UV plane. */
    if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
        dst_stride_v == halfwidth) {
        halfwidth *= halfheight;
        halfheight = 1;
        src_stride_uv = dst_stride_u = dst_stride_v = 0;
    }

    if (dst_y)
        CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);

    /* De-interleave UV into separate U and V planes. */
    SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
        halfwidth, halfheight);

    return 0;
}

/* Convert NV16 (Y plane + interleaved full-height UV plane, 4:2:2) to planar
 * I422.  A negative height flips the image; dst_y may be NULL to skip the
 * luma copy.  Returns 0 on success, -1 on invalid arguments. */
static int NV16ToI422(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height) {
    int halfwidth = (width + 1) >> 1;
    int halfheight = height; /* 4:2:2 chroma has full vertical resolution */
    if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
        return -1;
    }
    // Negative height means invert the image.
    if (height < 0) {
        height = -height;
        /* Fix: keep the chroma row count positive once the source pointers
         * are flipped, matching NV12ToI420.  The old code left halfheight
         * negative, so SplitUVPlane would flip the destination as well and
         * undo the requested inversion for the chroma planes. */
        halfheight = height;
        src_y = src_y + (height - 1) * src_stride_y;
        src_uv = src_uv + (height - 1) * src_stride_uv;
        src_stride_y = -src_stride_y;
        src_stride_uv = -src_stride_uv;
    }
    // Coalesce contiguous luma rows into one long row.
    if (src_stride_y == width && dst_stride_y == width) {
        width *= height;
        height = 1;
        src_stride_y = dst_stride_y = 0;
    }
    // Coalesce contiguous chroma rows.
    if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
        dst_stride_v == halfwidth) {
        halfwidth *= halfheight;
        halfheight = 1;
        src_stride_uv = dst_stride_u = dst_stride_v = 0;
    }

    if (dst_y) {
        CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
    }

    // Split UV plane - NV16
    SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v,
        halfwidth, halfheight);

    return 0;
}

/* Convert NV20LE (10-bit 4:2:2, Y plane + interleaved full-height UV plane)
 * to planar 422P10LE.  A negative height flips the image; dst_y may be NULL
 * to skip the luma copy.  Returns 0 on success, -1 on invalid arguments.
 * NOTE(review): the coalesce condition compares src_stride_uv against
 * width * 2 while SplitUVPlane_16 advances in uint16_t elements — confirm
 * the stride units against the callers. */
static int NV20LETo422P10LE(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height) {
    int halfwidth = (width + 1) >> 1;
    int halfheight = height; /* 4:2:2 chroma has full vertical resolution */
    if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
        return -1;
    }
    // Negative height means invert the image.
    if (height < 0) {
        height = -height;
        /* Fix: keep the chroma row count positive once the source pointers
         * are flipped (the old code left halfheight negative, so
         * SplitUVPlane_16 would flip the destination too and undo the
         * requested inversion for the chroma planes). */
        halfheight = height;
        src_y = src_y + (height - 1) * src_stride_y;
        src_uv = src_uv + (height - 1) * src_stride_uv;
        src_stride_y = -src_stride_y;
        src_stride_uv = -src_stride_uv;
    }

    if (dst_y) {
        CopyPlane_16((uint16_t *)src_y, src_stride_y, (uint16_t *)dst_y, dst_stride_y, width, height, 10);
    }

    // Coalesce contiguous chroma rows.
    if (src_stride_uv == width * 2 && dst_stride_u == width &&
        dst_stride_v == width) {
        halfwidth *= halfheight;
        halfheight = 1;
        src_stride_uv = dst_stride_u = dst_stride_v = 0;
    }

    // Split UV plane
    SplitUVPlane_16((uint16_t *)src_uv, src_stride_uv, (uint16_t *)dst_u, dst_stride_u, (uint16_t *)dst_v, dst_stride_v,
        halfwidth, halfheight, 10);

    return 0;
}

/* Convert P010LE (10-bit 4:2:0, Y plane + interleaved half-height UV plane)
 * to planar 420P10LE.  A negative height flips the image; dst_y may be NULL
 * to skip the luma copy.  Returns 0 on success, -1 on invalid arguments. */
static int P010LETo420P10LE(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v,
    int dst_stride_v, int width, int height) {
    int halfwidth = (width + 1) >> 1;
    int halfheight = (height + 1) >> 1;
    if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
        return -1;
    }
    // Negative height means invert the image.
    if (height < 0) {
        height = -height;
        /* Fix: recompute the chroma row count from the now-positive height
         * (the old code kept the negative value computed above). */
        halfheight = (height + 1) >> 1;
        src_y = src_y + (height - 1) * src_stride_y;
        /* Fix: the 4:2:0 UV plane only has halfheight rows; offsetting by
         * (height - 1) rows, as the old code did, points past the plane.
         * This matches NV12ToI420 above. */
        src_uv = src_uv + (halfheight - 1) * src_stride_uv;
        src_stride_y = -src_stride_y;
        src_stride_uv = -src_stride_uv;
    }

    if (dst_y) {
        CopyPlane_16((uint16_t *)src_y, src_stride_y, (uint16_t *)dst_y, dst_stride_y, width, height, 10);
    }

    // Coalesce contiguous chroma rows.
    if (src_stride_uv == width * 2 && dst_stride_u == width &&
        dst_stride_v == width) {
        halfwidth *= halfheight;
        halfheight = 1;
        src_stride_uv = dst_stride_u = dst_stride_v = 0;
    }

    // Split UV plane
    SplitUVPlane_16((uint16_t *)src_uv, src_stride_uv, (uint16_t *)dst_u, dst_stride_u, (uint16_t *)dst_v, dst_stride_v,
        halfwidth, halfheight, 10);

    return 0;
}

int NV12ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv,
    uint8_t* dst_argb, int dst_stride_argb, int width, int height) {
    return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
        dst_stride_argb, &kYuvI601Constants, width, height);
}

/* end libyuv */

/* extend omx format */
#define OMX_COLOR_FormatYVU422PackedSemiPlanar          52

/* All vendor-offset values are parenthesized so the expansions are safe in
 * any surrounding expression (the originals were bare `base + offset`). */
#define OMX_COLOR_FormatYVU420SemiPlanar                (OMX_COLOR_FormatVendorStartUnused + 0x00000001)
#define OMX_COLOR_FormatYVU420PackedSemiPlanar          (OMX_COLOR_FormatVendorStartUnused + 0x00000002)

#define OMX_COLOR_FormatYUV444PackedSemiPlanar          (OMX_COLOR_FormatVendorStartUnused + 0x00000021)  /**< YUV444 packed semi-planar format */
#define OMX_COLOR_FormatYVU444PackedSemiPlanar          (OMX_COLOR_FormatVendorStartUnused + 0x00000022)  /**< YVU444 packed semi-planar format */
#define OMX_COLOR_Format10bitMsbYUV420PackedSemiPlanar  (OMX_COLOR_FormatVendorStartUnused + 0x00000023)  /**< 10-bit Msb YUV420 packed semi-planar format */
#define OMX_COLOR_Format10bitMsbYVU420PackedSemiPlanar  (OMX_COLOR_FormatVendorStartUnused + 0x00000024)  /**< 10-bit Msb YVU420 packed semi-planar format */
#define OMX_COLOR_Format10bitMsbYUV422PackedSemiPlanar  (OMX_COLOR_FormatVendorStartUnused + 0x00000025)  /**< 10-bit Msb YUV422 packed semi-planar format */
#define OMX_COLOR_Format10bitMsbYVU422PackedSemiPlanar  (OMX_COLOR_FormatVendorStartUnused + 0x00000026)  /**< 10-bit Msb YVU422 packed semi-planar format */
#define OMX_COLOR_Format10bitMsbYUV444PackedSemiPlanar  (OMX_COLOR_FormatVendorStartUnused + 0x00000027)  /**< 10-bit Msb YUV444 packed semi-planar format */
#define OMX_COLOR_Format10bitMsbYVU444PackedSemiPlanar  (OMX_COLOR_FormatVendorStartUnused + 0x00000028)  /**< 10-bit Msb YVU444 packed semi-planar format */
#define OMX_COLOR_Format10bitYVU422PackedSemiPlanar     (OMX_COLOR_FormatVendorStartUnused + 0x00000029)  /*Packed version of For 10bit 422 PL21 : packed 3 pixel values in 32bit word*/
#define OMX_COLOR_Format10bitYUV422PackedSemiPlanar     (OMX_COLOR_FormatVendorStartUnused + 0x00000030)  /*Packed version of For 10bit 422 PL21 : packed 3 pixel values in 32bit word*/

/* extend omx codingtype */
#define OMX_VIDEO_CodingVP8 9
#define OMX_VIDEO_CodingVP9 10
#define OMX_VIDEO_CodingHEVC 11

#define OMX_VIDEO_CodingVC1            (OMX_VIDEO_CodingVendorStartUnused + 0x00000002)
#define OMX_VIDEO_CodingSorensonSpark  (OMX_VIDEO_CodingVendorStartUnused + 0x00000003)
#define OMX_VIDEO_CodingVP6            (OMX_VIDEO_CodingVendorStartUnused + 0x00000004)
#define OMX_VIDEO_CodingAVS            (OMX_VIDEO_CodingVendorStartUnused + 0x00000005)

#define MIN(a, b)  (((a) < (b)) ? (a) : (b))
//#define FTOMX_MAX_FRAME_BUFFER 12
#define FTOMX_MAX_DELAYED_BUF 100

#define USEC_PER_SEC 1000000

#ifdef OMX_SKIP64BIT
/* With OMX_SKIP64BIT, OMX_TICKS is a struct of two 32-bit halves; these
 * helpers convert between that and a flat int64_t timestamp. */
static OMX_TICKS to_omx_ticks(int64_t value)
{
    OMX_TICKS s;
    s.nLowPart  = value & 0xffffffff;
    s.nHighPart = value >> 32;  /* sign lands in the high part */
    return s;
}
static int64_t from_omx_ticks(OMX_TICKS value)
{
    return (((int64_t)value.nHighPart) << 32) | value.nLowPart;
}
#else
/* Without OMX_SKIP64BIT, OMX_TICKS is already a 64-bit integer. */
#define to_omx_ticks(x) (x)
#define from_omx_ticks(x) (x)
#endif

#define OMX_VERSION_MAJOR 1
#define OMX_VERSION_MINOR 2
#define OMX_VERSION_REVISION 0
#define OMX_VERSION_STEP 0

/* Fill the common nSize/nVersion header of an OMX struct `x` (by value).
 * Requires a local `s` (FTOMXCodecContext *) in scope.  The argument is now
 * parenthesized; the original `x.nSize` would mis-bind for e.g. `*ptr`. */
#define INIT_STRUCT(x) do {                                               \
        (x).nSize = sizeof(x);                                            \
        (x).nVersion = s->version;                                        \
    } while (0)
/* Log and return AVERROR_UNKNOWN from the calling function on any OMX
 * error.  Requires a local `avctx` in scope.  `x` is evaluated several
 * times, so pass a plain variable, not an expression with side effects. */
#define CHECK(x) do {                                                     \
        if ((x) != OMX_ErrorNone) {                                       \
            av_log(avctx, AV_LOG_ERROR,                                   \
                   "err %x (%d) on line %d\n", (x), (x), __LINE__);       \
            return AVERROR_UNKNOWN;                                       \
        }                                                                 \
    } while (0)

/* Zero the OMX struct pointed to by `st` and fill in its size/version header. */
#define FT_OMX_INIT_STRUCT(st) do { \
  memset ((st), 0, sizeof (*(st))); \
  (st)->nSize = sizeof (*(st)); \
  (st)->nVersion.s.nVersionMajor = OMX_VERSION_MAJOR; \
  (st)->nVersion.s.nVersionMinor = OMX_VERSION_MINOR; \
  (st)->nVersion.s.nRevision = OMX_VERSION_REVISION; \
  (st)->nVersion.s.nStep = OMX_VERSION_STEP; \
} while(0)

/* Dynamically loaded OMX core: the dlopen() handle plus the resolved
 * entry points this codec uses (see ft_try_load()). */
typedef struct FTOMXContext {
    void *lib;  /* dlopen() handle; NULL when statically linked */
    OMX_ERRORTYPE (*ptr_Init)(void);
    OMX_ERRORTYPE (*ptr_Deinit)(void);
    OMX_ERRORTYPE (*ptr_ComponentNameEnum)(OMX_STRING, OMX_U32, OMX_U32);
    OMX_ERRORTYPE (*ptr_GetHandle)(OMX_HANDLETYPE*, OMX_STRING, OMX_PTR, OMX_CALLBACKTYPE*);
    OMX_ERRORTYPE (*ptr_FreeHandle)(OMX_HANDLETYPE);
    OMX_ERRORTYPE (*ptr_GetComponentsOfRole)(OMX_STRING, OMX_U32*, OMX_U8**);
    OMX_ERRORTYPE (*ptr_GetRolesOfComponent)(OMX_STRING, OMX_U32*, OMX_U8**);
} FTOMXContext;

struct ftomx_frame_tms;
/* Node of the doubly linked, timestamp-sorted list that carries per-packet
 * metadata from input packets to output frames (see reorder_tms()/
 * dequeue_tms()). */
typedef struct ftomx_frame_tms {
    int64_t tms;           /* timestamp chosen for the packet (pts, else dts) */
    int64_t duration;      /* packet duration */
    int32_t size;          /* packet size in bytes */
    int64_t pos;           /* packet byte position in the input stream */
    struct ftomx_frame_tms *prev;
    struct ftomx_frame_tms *next;
    struct timeval enter;  /* wall-clock time the packet entered the queue */
} FTOMX_FRAME_TMS;

/* Per-instance state of the OMX video decoder wrapper. */
typedef struct FTOMXCodecContext {
    const AVClass *class;
    char *libname;              /* name of the OMX core library to load */
    FTOMXContext *omx_context;  /* loaded OMX core entry points */

    AVCodecContext *avctx;

    char component_name[OMX_MAX_STRINGNAME_SIZE];
    OMX_VERSIONTYPE version;
    OMX_HANDLETYPE handle;      /* OMX component handle */
    int in_port, out_port;      /* OMX port indices */
    OMX_COLOR_FORMATTYPE color_format;
    int stride, plane_size;

    /* Buffer pools: all allocated headers, plus the subsets that are
     * currently free for input or filled and waiting for output. */
    int num_in_buffers, num_out_buffers;
    OMX_BUFFERHEADERTYPE **in_buffer_headers;
    OMX_BUFFERHEADERTYPE **out_buffer_headers;
    int num_free_in_buffers;
    OMX_BUFFERHEADERTYPE **free_in_buffers;
    int num_done_out_buffers;
    OMX_BUFFERHEADERTYPE **done_out_buffers;
    pthread_mutex_t input_mutex;
    pthread_cond_t input_cond;
    pthread_mutex_t output_mutex;
    pthread_cond_t output_cond;

    /* Synchronization with the OMX event callback thread (event_handler). */
    pthread_mutex_t state_mutex;
    pthread_cond_t state_cond;
    pthread_mutex_t port_mutex;
    pthread_cond_t port_cond;
    pthread_mutex_t flush_mutex;
    pthread_cond_t flush_cond;
    pthread_mutex_t buffer_mutex;
    pthread_cond_t buffer_cond;
    OMX_STATETYPE state;        /* last state reported by the component */
    OMX_ERRORTYPE error;        /* error reported via OMX_EventError, if any */

    int mutex_cond_inited;

    OMX_BOOL eos_sent, got_eos;

    int profile;

    int64_t last_timestamp;
    OMX_BUFFERHEADERTYPE *cur;
    OMX_BOOL configed;          /* set when output port settings arrive */
    OMX_BOOL started;
    OMX_PARAM_PORTDEFINITIONTYPE in_port_params;
    OMX_PARAM_PORTDEFINITIONTYPE out_port_params;

    /* NOTE(review): indexed directly by OMX port number in event_handler and
     * ft_omx_port_flush — assumes the ports are numbered 0 and 1; confirm. */
    OMX_BOOL port_flushing[2];

    OMX_BOOL b_has_keyframe;

    int64_t req_num;
    int64_t done_num;

    OMX_BOOL extradata_decoded;
    OMX_BOOL inited;
    OMX_BOOL seeked;
    enum AVPixelFormat orig_format;
    OMX_BOOL format_convert;
    OMX_BOOL low_latency;       /* enables the frame-drop path in dequeue_tms */
    int pending_num;            /* number of entries in the timestamp list */
    OMX_BOOL dropping;          /* currently dropping delayed frames */
    OMX_BOOL format_bgr;
    OMX_BOOL reordered;         /* pts was taken from avctx->reordered_opaque */

    /* Timestamp-ordered metadata list, oldest at head (see reorder_tms). */
    FTOMX_FRAME_TMS *tms_head;
    FTOMX_FRAME_TMS *tms_tail;
} FTOMXCodecContext;

/* Forward declarations for port control, buffer recycling and decode
 * helpers defined later in this file. */
int ft_omx_switch_port(FTOMXCodecContext *s, int index, OMX_BOOL on, OMX_BOOL wait);
int ft_omx_port_flush(FTOMXCodecContext *s, int index, OMX_BOOL wait);
void ftomx_wait_port_flushed(FTOMXCodecContext *s, int index);
static OMX_BOOL fill_frame_buffer(FTOMXCodecContext *s, AVFrame *pict, uint8_t *buf);
static void update_color_format(FTOMXCodecContext *s, OMX_COLOR_FORMATTYPE color_format);
static av_cold int ftomx_decodec_end(AVCodecContext *avctx);
static void ftomx_reset_buffer(FTOMXCodecContext *s, OMX_BUFFERHEADERTYPE *buf);

static void append_buffer(pthread_mutex_t *mutex, pthread_cond_t *cond,
                          int* array_size, OMX_BUFFERHEADERTYPE **array,
                          OMX_BUFFERHEADERTYPE *buffer);
static void ftomx_reconfig_outport(FTOMXCodecContext *s);
static int ftomx_handle_pkt(AVCodecContext *avctx, void *data,
        int *got_frame, AVPacket *avpkt);
static int ftomx_handle_eof(AVCodecContext *avctx, void *data,
        int *got_frame, AVPacket *avpkt);

static av_cold int ftomx_decode_init(AVCodecContext *avctx);

/* Timestamp bookkeeping (definitions directly below). */
int64_t reorder_tms(FTOMXCodecContext *s, AVPacket *avpkt);
OMX_BUFFERHEADERTYPE *dequeue_tms(FTOMXCodecContext *s, OMX_BUFFERHEADERTYPE *out_buffer, AVFrame *frame);

/* Record the timing metadata of an incoming packet and insert it into the
 * timestamp-sorted list s->tms_head..s->tms_tail.  Returns the timestamp
 * chosen for the packet (pts, falling back to reordered_opaque / dts), or
 * 0 if the node allocation fails.
 * NOTE(review): `last_duration` is function-static, so it is shared by all
 * decoder instances and is not thread-safe — confirm single-instance use. */
int64_t reorder_tms(FTOMXCodecContext *s, AVPacket *avpkt) {
    FTOMX_FRAME_TMS *fp = s->tms_tail;
    FTOMX_FRAME_TMS *cur = malloc(sizeof(FTOMX_FRAME_TMS));
    int64_t pts;
    int64_t dts;
    int64_t duration;
    static int64_t last_duration = 0;
    int64_t tms = 0;

    if (!cur) {
        av_log(s->avctx, AV_LOG_ERROR, "malloc failed\n"); /* was "mallloc failed" with no newline */
        return tms;
    }

    pts = avpkt->pts;
    dts = avpkt->dts;
    duration = avpkt->duration;

    /* No usable pts: fall back to the timestamp the caller stashed in
     * reordered_opaque, and remember that we reordered. */
    if (((pts == AV_NOPTS_VALUE) || (0 == pts)) && (s->avctx->reordered_opaque != AV_NOPTS_VALUE)) {
        pts = s->avctx->reordered_opaque;
        s->reordered = OMX_TRUE;
    }

    memset(cur, 0, sizeof(*cur));
    if (AV_NOPTS_VALUE != pts) {
        tms = pts;
    } else {
        tms = dts;
    }

    s->pending_num++;

    /* Two consecutive zero timestamps: substitute the previous duration so
     * the entries remain distinguishable. */
    if (!tms && s->tms_head && (tms == s->tms_head->tms)) {
        tms = last_duration;
    }

    cur->tms = tms;
    cur->duration = duration;
    cur->pos = avpkt->pos;
    cur->size = avpkt->size;
    gettimeofday(&cur->enter, NULL);

    /* Empty list: the new node is both head and tail. */
    if (!s->tms_head) {
        s->tms_head = cur;
        s->tms_tail = cur;
        last_duration = duration;
        return tms;
    }

    /* Walk backwards from the tail to the first node with tms <= ours. */
    while (fp) {
        if (fp->tms <= tms) {
            break;
        }
        fp = fp->prev;
    }

    if (fp) {
        /* Insert after fp. */
        cur->next = fp->next;
        cur->prev = fp;
        if (fp->next) {
            fp->next->prev = cur;
        }
        fp->next = cur;
    } else {
        /* Smaller than everything: new head. */
        cur->next = s->tms_head;
        s->tms_head = cur;
    }

    if (!cur->next) {
        s->tms_tail = cur;
    }

    last_duration = duration;

    return tms;
}

/* Drop the oldest pending timestamp entry and hand `out_buffer` back to the
 * component for refilling.  Helper for the low-latency drop path below. */
static void ftomx_drop_tms_head(FTOMXCodecContext *s, OMX_BUFFERHEADERTYPE *out_buffer)
{
    FTOMX_FRAME_TMS *next;

    ftomx_reset_buffer(s, out_buffer);
    OMX_FillThisBuffer(s->handle, out_buffer);

    next = s->tms_head->next;
    if (next) {
        next->prev = NULL;
    }
    free(s->tms_head);
    s->tms_head = next;
    s->pending_num--;
}

/* Attach the oldest queued packet metadata to `out_buffer`/`frame`.  In
 * low-latency mode, frames that have been queued longer than 90 ms are
 * dropped (the buffer is recycled and NULL is returned); dropping continues
 * until the backlog shrinks to at most one entry.
 * Returns out_buffer on success, NULL when the frame was dropped or
 * out_buffer is NULL. */
OMX_BUFFERHEADERTYPE *dequeue_tms(FTOMXCodecContext *s, OMX_BUFFERHEADERTYPE *out_buffer, AVFrame *frame) {
    struct timeval now;
    uint64_t delayed = 0;
    FTOMX_FRAME_TMS *next;

    if (!out_buffer) {
        return NULL;
    }

    if (!s->tms_head) {
        /* No metadata queued: pass the buffer through untouched. */
        return out_buffer;
    }

    if (s->low_latency && !s->eos_sent) {
        gettimeofday(&now, NULL);
        delayed = (now.tv_sec - s->tms_head->enter.tv_sec) * 1000000 + now.tv_usec - s->tms_head->enter.tv_usec;
        if (delayed > 90 * 1000) {
            /* Beyond the maximum delay: enter dropping mode. */
            s->dropping = OMX_TRUE;
            av_log(s->avctx, AV_LOG_ERROR, "drop delayed frame, timestamp:%"PRId64", pending_num:%d, delayed:%"PRIu64"\n",
                   s->tms_head->tms, s->pending_num, delayed);
            ftomx_drop_tms_head(s, out_buffer);
            return NULL;
        } else if (s->dropping) {
            /* Keep draining the backlog until it is down to one entry. */
            av_log(s->avctx, AV_LOG_ERROR, "drop delayed frame, timestamp:%"PRId64", pending_num:%d, delayed:%"PRIu64"\n",
                   s->tms_head->tms, s->pending_num, delayed);
            ftomx_drop_tms_head(s, out_buffer);
            /* The original recomputed `delayed` from the new head here and
             * never used it — dereferencing a possibly-NULL head; removed. */
            if (s->pending_num <= 1) {
                s->dropping = OMX_FALSE;
            }
            return NULL;
        }
    }

    /* Transfer the oldest packet's metadata onto the output frame. */
    out_buffer->nTimeStamp = to_omx_ticks(s->tms_head->tms);
    frame->pts = s->tms_head->tms; /* fix pts for current frame */
    frame->pkt_pos = s->tms_head->pos;
    frame->pkt_duration = s->tms_head->duration;
    frame->pkt_size = s->tms_head->size;

    next = s->tms_head->next;
    free(s->tms_head);
    s->tms_head = next;
    if (next) {
        next->prev = NULL;
    }

    s->pending_num--;
    return out_buffer;
}

/* Load the OMX core library `libname` and resolve the required entry
 * points into `s`.  Returns 0 on success or AVERROR_ENCODER_NOT_FOUND when
 * the library cannot be loaded or lacks a required symbol. */
static av_cold int ft_try_load(FTOMXContext *s, void *logavctx,
                                const char *libname)
{
#if 0 //use static lib
    s->ptr_Init                = OMX_Init;
    s->ptr_Deinit              = OMX_Deinit;
    s->ptr_ComponentNameEnum   = OMX_ComponentNameEnum;
    s->ptr_GetHandle           = OMX_GetHandle;
    s->ptr_FreeHandle          = OMX_FreeHandle;
    s->ptr_GetComponentsOfRole = OMX_GetComponentsOfRole;
    s->ptr_GetRolesOfComponent = OMX_GetRolesOfComponent;
    av_log(logavctx, AV_LOG_INFO, "use static omx lib\n");
#else
    s->lib = dlopen(libname, RTLD_NOW | RTLD_GLOBAL);
    if (!s->lib) {
        /* Fix: dlopen() reports failures via dlerror(), not errno. */
        av_log(logavctx, AV_LOG_WARNING, "%s not found, %s\n", libname, dlerror());
        return AVERROR_ENCODER_NOT_FOUND;
    }
    av_log(logavctx, AV_LOG_INFO, "%s load success\n", libname);

    s->ptr_Init                = dlsym(s->lib, "OMX_Init");
    s->ptr_Deinit              = dlsym(s->lib, "OMX_Deinit");
    s->ptr_ComponentNameEnum   = dlsym(s->lib, "OMX_ComponentNameEnum");
    s->ptr_GetHandle           = dlsym(s->lib, "OMX_GetHandle");
    s->ptr_FreeHandle          = dlsym(s->lib, "OMX_FreeHandle");
    s->ptr_GetComponentsOfRole = dlsym(s->lib, "OMX_GetComponentsOfRole");
    s->ptr_GetRolesOfComponent = dlsym(s->lib, "OMX_GetRolesOfComponent");
#endif

    if (!s->ptr_Init || !s->ptr_Deinit || !s->ptr_ComponentNameEnum ||
        !s->ptr_GetHandle || !s->ptr_FreeHandle ||
        !s->ptr_GetComponentsOfRole || !s->ptr_GetRolesOfComponent) {
        av_log(logavctx, AV_LOG_WARNING, "invalid library\n");

        if (s->lib) {
            dlclose(s->lib);
        }
        s->lib = NULL;
        return AVERROR_ENCODER_NOT_FOUND;
    }
    return 0;
}

/* Load an OMX core (the given libname, or the first usable default) and
 * initialize it.  Returns an owned FTOMXContext, or NULL on failure; the
 * caller releases it with omx_deinit(). */
static av_cold FTOMXContext *ftomx_init(void *logavctx, const char *libname)
{
    /* Candidate libraries, stored as (name, unused) pairs — the enumeration
     * below steps by two and stops at the terminating NULL. */
    static const char * const libnames[] = {
        "libomx_ftvxd.so", NULL,
        "libomx_vxd.so", NULL,
        NULL
    };
    const char* const* nameptr;
    int ret = AVERROR_ENCODER_NOT_FOUND;
    FTOMXContext *omx_context;

    omx_context = av_mallocz(sizeof(*omx_context));
    if (!omx_context)
        return NULL;
    if (libname) {
        ret = ft_try_load(omx_context, logavctx, libname);
        if (ret < 0) {
            av_free(omx_context);
            return NULL;
        }
    } else {
        for (nameptr = libnames; *nameptr; nameptr += 2)
            if (!(ret = ft_try_load(omx_context, logavctx, nameptr[0])))
                break;
        if (!*nameptr) {
            av_free(omx_context);
            return NULL;
        }
    }

    ret = omx_context->ptr_Init();
    /* Fix: the original ignored an OMX_Init() failure and returned a
     * half-initialized context. */
    if (ret != OMX_ErrorNone) {
        av_log(logavctx, AV_LOG_ERROR, "OMX_Init failed: 0x%x\n", ret);
        if (omx_context->lib)
            dlclose(omx_context->lib);
        av_free(omx_context);
        return NULL;
    }
#ifdef HAS_ARM_NEON
    av_log(logavctx, AV_LOG_INFO, "omx init ret:0x%x, uid:%d\n", ret, getuid());
#endif
    return omx_context;
}

/* Shut down the OMX core and release the loaded library and the context. */
static av_cold void omx_deinit(FTOMXContext *omx_context)
{
    if (omx_context == NULL)
        return;

    omx_context->ptr_Deinit();
    if (omx_context->lib != NULL)
        dlclose(omx_context->lib);
    av_free(omx_context);
}

/* Forward declaration: query the current definition of port `index`
 * (defined below; needed by event_handler). */
static OMX_ERRORTYPE ft_omx_get_port_definition(FTOMXCodecContext *s,
    int index,
    OMX_PARAM_PORTDEFINITIONTYPE * port_def);

/* Scrub the per-use fields of an OMX buffer header and re-tag it with our
 * codec context before it is handed back to the component. */
static void ftomx_reset_buffer(FTOMXCodecContext *s, OMX_BUFFERHEADERTYPE *buf) {
    buf->nFilledLen = 0;
    buf->nOffset = 0;
    buf->nFlags = 0;
    buf->nTimeStamp = 0;
    buf->pAppPrivate = s;
}

/* Push `buffer` onto the tail of a mutex-protected buffer array and wake
 * every thread blocked in get_buffer(). */
static void append_buffer(pthread_mutex_t *mutex, pthread_cond_t *cond,
                          int* array_size, OMX_BUFFERHEADERTYPE **array,
                          OMX_BUFFERHEADERTYPE *buffer)
{
    pthread_mutex_lock(mutex);
    array[*array_size] = buffer;
    *array_size += 1;
    pthread_cond_broadcast(cond);
    pthread_mutex_unlock(mutex);
}

/* Pop the oldest buffer from a mutex-protected array.
 * wait == 0: return immediately (possibly NULL).
 * wait != 0, timeout == 0: block until a buffer arrives.
 * wait != 0, timeout  > 0: block at most `timeout` milliseconds.
 * Returns the buffer, or NULL if none became available. */
static OMX_BUFFERHEADERTYPE *get_buffer(pthread_mutex_t *mutex, pthread_cond_t *cond,
                                        int* array_size, OMX_BUFFERHEADERTYPE **array,
                                        int wait, int timeout)
{
    OMX_BUFFERHEADERTYPE *buffer;
    struct timeval tv;
    struct timespec ts;
    int64_t usec;

    if (wait && timeout) {
        /* Build an absolute CLOCK_REALTIME deadline `timeout` ms from now.
         * Fix: the original subtracted 1000000 us at most once, so any
         * timeout >= ~1 s could yield tv_nsec >= 1e9, which makes
         * pthread_cond_timedwait() fail with EINVAL immediately. */
        gettimeofday(&tv, NULL);
        usec = (int64_t)tv.tv_usec + (int64_t)timeout * 1000;
        ts.tv_sec = tv.tv_sec + usec / USEC_PER_SEC;
        ts.tv_nsec = (usec % USEC_PER_SEC) * 1000;
    }

    pthread_mutex_lock(mutex);
    if (wait) {
        while (!*array_size) {
            if (!timeout) {
                pthread_cond_wait(cond, mutex);
            } else {
                if (pthread_cond_timedwait(cond, mutex, &ts) == ETIMEDOUT) {
                    break;
                }
            }
        }
    }

    if (*array_size > 0) {
        /* Dequeue from the front; shift the rest down. */
        buffer = array[0];
        (*array_size)--;
        memmove(&array[0], &array[1], (*array_size) * sizeof(OMX_BUFFERHEADERTYPE*));
    } else {
        buffer = NULL;
    }
    pthread_mutex_unlock(mutex);
    return buffer;
}

/*
 * OMX core event callback, invoked on the component's thread.  Every case
 * takes the corresponding mutex, updates the shared state and broadcasts so
 * that wait_for_state()/wait_for_port_onoff()/ft_omx_port_flush() wake up.
 */
static OMX_ERRORTYPE event_handler(OMX_HANDLETYPE component, OMX_PTR app_data, OMX_EVENTTYPE event,
                                   OMX_U32 data1, OMX_U32 data2, OMX_PTR event_data)
{
    FTOMXCodecContext *s = app_data;
    // This uses casts in the printfs, since OMX_U32 actually is a typedef for
    // unsigned long in official header versions (but there are also modified
    // versions where it is something else).
    switch (event) {
    case OMX_EventError:
        /* Record the error code; wait_for_state() checks s->error. */
        pthread_mutex_lock(&s->state_mutex);
        av_log(s->avctx, AV_LOG_ERROR, "OMX error %"PRIx32"\n", (uint32_t) data1);
        s->error = data1;
        pthread_cond_broadcast(&s->state_cond);
        pthread_mutex_unlock(&s->state_mutex);
        break;
    case OMX_EventCmdComplete:
        switch (data1) {
            case OMX_CommandStateSet:
                /* data2 carries the newly reached state. */
                pthread_mutex_lock(&s->state_mutex);
                s->state = data2;
                av_log(s->avctx, AV_LOG_INFO, "OMX state changed to %"PRIu32"\n", (uint32_t) data2);
                pthread_cond_broadcast(&s->state_cond);
                pthread_mutex_unlock(&s->state_mutex);
                break;

            case OMX_CommandPortDisable:
                pthread_mutex_lock(&s->port_mutex);
                av_log(s->avctx, AV_LOG_INFO, "OMX port %"PRIu32" disabled\n", (uint32_t) data2);
                pthread_cond_broadcast(&s->port_cond);
                pthread_mutex_unlock(&s->port_mutex);
                break;

            case OMX_CommandPortEnable:
                pthread_mutex_lock(&s->port_mutex);
                av_log(s->avctx, AV_LOG_INFO, "OMX port %"PRIu32" enabled\n", (uint32_t) data2);
                pthread_cond_broadcast(&s->port_cond);
                pthread_mutex_unlock(&s->port_mutex);
                break;

            case OMX_CommandFlush:
                /* NOTE(review): data2 is the flushed port number and indexes
                 * the two-element port_flushing[] — assumes ports 0/1. */
                pthread_mutex_lock(&s->flush_mutex);
                s->port_flushing[data2] = OMX_FALSE;
                pthread_cond_broadcast(&s->flush_cond);
                pthread_mutex_unlock(&s->flush_mutex);
                av_log(s->avctx, AV_LOG_INFO, "OMX port %"PRIu32" flushed\n", (uint32_t) data2);
                break;

            default:
                av_log(s->avctx, AV_LOG_ERROR, "OMX command complete, command %"PRIu32", value %"PRIu32"\n",
                        (uint32_t) data1, (uint32_t) data2);
                break;
        }
        break;

    case OMX_EventPortSettingsChanged:
        /* Re-read the changed port definition; an output-port change also
         * marks the decoder as configured for ftomx_reconfig_outport(). */
        av_log(s->avctx, AV_LOG_INFO, "OMX port %"PRIu32" settings changed\n", (uint32_t) data1);
        if (data2 == OMX_IndexParamPortDefinition) {
            if (data1 == s->in_port) {
                ft_omx_get_port_definition(s, (uint32_t)data1, &s->in_port_params);
            } else {
                ft_omx_get_port_definition(s, (uint32_t)data1, &s->out_port_params);

                pthread_mutex_lock(&s->port_mutex);
                s->configed = OMX_TRUE;
                pthread_cond_broadcast(&s->port_cond);
                pthread_mutex_unlock(&s->port_mutex);
            }
        }
        break;

    default:
        av_log(s->avctx, AV_LOG_ERROR, "OMX event %d %"PRIx32" %"PRIx32"\n",
                                         event, (uint32_t) data1, (uint32_t) data2);
        break;
    }
    return OMX_ErrorNone;
}

/* OMX callback: the component has consumed an input buffer.  Scrub its
 * per-use fields and park it on the free-input list for reuse. */
static OMX_ERRORTYPE empty_buffer_done(OMX_HANDLETYPE component, OMX_PTR app_data,
                                       OMX_BUFFERHEADERTYPE *buffer)
{
    FTOMXCodecContext *s = app_data;

    buffer->nFilledLen = 0;
    buffer->nOffset = 0;
    buffer->nFlags = 0;
    buffer->nTimeStamp = 0;

    append_buffer(&s->input_mutex, &s->input_cond,
                  &s->num_free_in_buffers, s->free_in_buffers, buffer);
    return OMX_ErrorNone;
}

/* OMX callback: the component produced a filled output buffer.  Recover the
 * codec context from pAppPrivate and queue the buffer for the decode path. */
static OMX_ERRORTYPE fill_buffer_done(OMX_HANDLETYPE component, OMX_PTR app_data,
                                      OMX_BUFFERHEADERTYPE *buffer)
{
    FTOMXCodecContext *s;

    av_assert0(buffer->pAppPrivate);
    s = buffer->pAppPrivate;

    append_buffer(&s->output_mutex, &s->output_cond,
                  &s->num_done_out_buffers, s->done_out_buffers, buffer);
    return OMX_ErrorNone;
}

static const OMX_CALLBACKTYPE callbacks = {
    event_handler,
    empty_buffer_done,
    fill_buffer_done
};

/* Look up the first OMX component implementing `role` and copy its name
 * into `str` (capacity str_size).  Returns 0 on success, a negative
 * AVERROR otherwise. */
static av_cold int find_component(FTOMXContext *ft_context, void *logavctx,
                                  const char *role, char *str, int str_size)
{
    OMX_U32 count = 0;
    OMX_U32 i;
    char **names;
    int ret = 0;

    /* First call: ask how many components implement the role. */
    ft_context->ptr_GetComponentsOfRole((OMX_STRING) role, &count, NULL);
    if (!count) {
        av_log(logavctx, AV_LOG_WARNING, "No component for role %s found\n", role);
        return AVERROR_ENCODER_NOT_FOUND;
    }

    names = av_mallocz_array(count, sizeof(*names));
    if (!names)
        return AVERROR(ENOMEM);

    for (i = 0; i < count; i++) {
        names[i] = av_mallocz(OMX_MAX_STRINGNAME_SIZE);
        if (!names[i]) {
            ret = AVERROR(ENOMEM);
            goto end;
        }
    }

    /* Second call: fetch the names; keep the first one. */
    ft_context->ptr_GetComponentsOfRole((OMX_STRING) role, &count, (OMX_U8**) names);
    av_strlcpy(str, names[0], str_size);
end:
    for (i = 0; i < count; i++)
        av_free(names[i]);
    av_free(names);
    return ret;
}

/* Block until event_handler reports the target component state, or an OMX
 * error.  Returns 0 on success, AVERROR_ENCODER_NOT_FOUND on error. */
static av_cold int wait_for_state(FTOMXCodecContext *s, OMX_STATETYPE state)
{
    int ret = 0;

    pthread_mutex_lock(&s->state_mutex);
    for (;;) {
        if (s->state == state || s->error != OMX_ErrorNone)
            break;
        pthread_cond_wait(&s->state_cond, &s->state_mutex);
    }
    if (s->error != OMX_ErrorNone)
        ret = AVERROR_ENCODER_NOT_FOUND;
    pthread_mutex_unlock(&s->state_mutex);
    return ret;
}

/* Query the current OMX definition of port `index` into `port_def`
 * (struct header is (re)initialized first).  Returns the OMX error code. */
static OMX_ERRORTYPE ft_omx_get_port_definition(FTOMXCodecContext *s,
    int index,
    OMX_PARAM_PORTDEFINITIONTYPE * port_def)
{
    OMX_ERRORTYPE err;
    FT_OMX_INIT_STRUCT(port_def);
    port_def->nPortIndex = index;

    err = OMX_GetParameter(s->handle, OMX_IndexParamPortDefinition, port_def);

    return err;
}

/**
 * Wait until port @p index reports bEnabled == @p enable.
 *
 * Bugfix: the original wait loop ignored s->error, so a component error
 * during a port transition would hang this thread forever. The loop now
 * also exits on error, mirroring wait_for_state().
 *
 * @return 0 on success, AVERROR_ENCODER_NOT_FOUND if the component errored
 */
static av_cold int wait_for_port_onoff(FTOMXCodecContext *s, int index, OMX_BOOL enable)
{
    int ret = 0;
    OMX_PARAM_PORTDEFINITIONTYPE *port_def = NULL;

    /* Re-use the cached definition struct matching this port. */
    if (index == s->in_port) {
        port_def = &s->in_port_params;
    } else {
        port_def = &s->out_port_params;
    }

    pthread_mutex_lock(&s->port_mutex);
    ft_omx_get_port_definition(s, index, port_def);
    while (port_def->bEnabled != enable && s->error == OMX_ErrorNone) {
        pthread_cond_wait(&s->port_cond, &s->port_mutex);
        /* Re-query after each wakeup; the enable state lives in the
         * component, not in our copy. */
        ft_omx_get_port_definition(s, index, port_def);
    }
    if (s->error != OMX_ErrorNone)
        ret = AVERROR_ENCODER_NOT_FOUND;
    pthread_mutex_unlock(&s->port_mutex);
    return ret;
}

/**
 * Push *port_def to the component, then read it back so the caller sees
 * any values the component adjusted (buffer counts, strides, ...).
 *
 * Bugfix: when port_def was NULL the original returned an uninitialized
 * err AND passed NULL to OMX_GetParameter; now reject NULL up front.
 */
static OMX_ERRORTYPE ft_omx_update_port_definition (FTOMXCodecContext *s,
    OMX_PARAM_PORTDEFINITIONTYPE * port_def)
{
    OMX_ERRORTYPE err;

    if (!port_def)
        return OMX_ErrorBadParameter;

    err = OMX_SetParameter(s->handle, OMX_IndexParamPortDefinition, port_def);

    /* Read back the (possibly adjusted) definition; the Set result is
     * what the caller cares about, so keep err from the Set call. */
    OMX_GetParameter(s->handle, OMX_IndexParamPortDefinition, port_def);

    return err;
}

/* Enable or disable port 'index'; optionally block until the component
 * reports the port in the requested state. Returns the OMX error code
 * (CHECK may turn a failure into an early AVERROR_UNKNOWN return). */
int ft_omx_switch_port(FTOMXCodecContext *s, int index, OMX_BOOL on, OMX_BOOL wait) {
    AVCodecContext *avctx = s->avctx; /* used by the CHECK() macro */
    OMX_COMMANDTYPE cmd = on ? OMX_CommandPortEnable : OMX_CommandPortDisable;
    int err;

    err = OMX_SendCommand(s->handle, cmd, index, NULL);
    CHECK(err);

    if (wait)
        wait_for_port_onoff(s, index, on);

    return err;
}

/**
 * Flush port @p index; if @p wait is set, block until the flush-complete
 * event clears port_flushing[index].
 *
 * Bugfix: port_flushing[index] was written without holding flush_mutex,
 * a data race against the thread that clears it under the lock (presumably
 * the OMX event thread — it also signals flush_cond). The flag is now set
 * under the lock, which is released before OMX_SendCommand in case the
 * flush-done callback fires synchronously and takes the same lock.
 */
int ft_omx_port_flush(FTOMXCodecContext *s, int index, OMX_BOOL wait) {
    int err = 0;

    pthread_mutex_lock(&s->flush_mutex);
    s->port_flushing[index] = OMX_TRUE;
    pthread_mutex_unlock(&s->flush_mutex);

    err = OMX_SendCommand(s->handle, OMX_CommandFlush, index, NULL);
    if (wait) {
        pthread_mutex_lock(&s->flush_mutex);
        while (s->port_flushing[index]) {
            pthread_cond_wait(&s->flush_cond, &s->flush_mutex);
        }
        pthread_mutex_unlock(&s->flush_mutex);
    }

    return err;
}

/* Block until port 'index' is no longer marked as flushing; the flag is
 * cleared and flush_cond signalled by another thread. */
void ftomx_wait_port_flushed(FTOMXCodecContext *s, int index) {
    pthread_mutex_lock(&s->flush_mutex);
    for (;;) {
        if (!s->port_flushing[index])
            break;
        pthread_cond_wait(&s->flush_cond, &s->flush_mutex);
    }
    pthread_mutex_unlock(&s->flush_mutex);
}

/* Block until the output port has been (re)configured with dimensions at
 * least as large as the stream's. s->configed is presumably set and
 * port_cond signalled by the OMX event thread on a port-settings-changed
 * event — TODO confirm against the event handler.
 * NOTE(review): s->configed is intentionally read once outside the lock
 * before entering the wait loop; the locked re-check inside the loop makes
 * this safe in practice. */
static av_cold void wait_port_configed(FTOMXCodecContext *s)
{
    int width, height;

    /* Video and image ports keep frame dimensions in different union
     * members of the port definition. */
    if (OMX_PortDomainVideo == s->out_port_params.eDomain) {
        width = s->out_port_params.format.video.nFrameWidth;
        height = s->out_port_params.format.video.nFrameHeight;
    } else {
        width = s->out_port_params.format.image.nFrameWidth;
        height = s->out_port_params.format.image.nFrameHeight;
    }

    /* Smaller-than-stream dimensions mean the new settings have not been
     * applied yet, so force another wait round. */
    if (s->avctx->width && s->avctx->height) {
        if ((width  < s->avctx->width) || (height < s->avctx->height)) {
            s->configed = OMX_FALSE;
        }
    }

    while (!s->configed) {
        pthread_mutex_lock(&s->port_mutex);
        if (!s->configed) {
            pthread_cond_wait(&s->port_cond, &s->port_mutex);
        }
        pthread_mutex_unlock(&s->port_mutex);

        /* Re-validate the freshly reported dimensions after each wakeup. */
        if (s->avctx->width && s->avctx->height) {
            if (OMX_PortDomainVideo == s->out_port_params.eDomain) {
                width = s->out_port_params.format.video.nFrameWidth;
                height = s->out_port_params.format.video.nFrameHeight;
            } else {
                width = s->out_port_params.format.image.nFrameWidth;
                height = s->out_port_params.format.image.nFrameHeight;
            }
            if ((width  < s->avctx->width) || (height < s->avctx->height)) {
                s->configed = OMX_FALSE;
            }
        }
    }
    return;
}

/* Bring up the OMX component selected by find_component():
 *  1. get a handle and set the standard component role,
 *  2. identify input/output ports and configure their definitions,
 *  3. pick a supported output color format and compression format,
 *  4. allocate input buffers and drive the component Loaded->Idle->Executing.
 * Returns 0 on success, an AVERROR on failure. Note that the "in" port here
 * receives compressed data (this is a decoder despite the file's header
 * comment), and the "out" port produces raw frames.
 * NOTE(review): error paths in this function return without releasing the
 * handle/buffers; the caller is expected to run cleanup() — verify. */
static av_cold int ft_component_init(AVCodecContext *avctx, const char *role)
{
    FTOMXCodecContext *s = avctx->priv_data;
    OMX_PARAM_COMPONENTROLETYPE role_params = { 0 };
    OMX_VIDEO_PARAM_PORTFORMATTYPE video_port_format = { 0 };
    OMX_IMAGE_PARAM_PORTFORMATTYPE image_port_format = { 0 };
    OMX_PARAM_PORTDEFINITIONTYPE *port_def = NULL;
    OMX_ERRORTYPE err;
    int i;

    /* Advertise OMX IL API version 1.2.0 to the component. */
    s->version.s.nVersionMajor = 1;
    s->version.s.nVersionMinor = 2;
    s->version.s.nRevision     = 0;

    err = s->omx_context->ptr_GetHandle(&s->handle, s->component_name, s, (OMX_CALLBACKTYPE*) &callbacks);
    if (err != OMX_ErrorNone) {
        av_log(avctx, AV_LOG_ERROR, "OMX_GetHandle(%s) failed: %x\n", s->component_name, err);
        return AVERROR_UNKNOWN;
    }

    // This one crashes the mediaserver on qcom, if used over IOMX
    INIT_STRUCT(role_params);
    av_strlcpy(role_params.cRole, role, sizeof(role_params.cRole));
    // Intentionally ignore errors on this one
    OMX_SetParameter(s->handle, OMX_IndexParamStandardComponentRole, &role_params);

    /* Assume port 0 = input, port 1 = output; swap if the component
     * reports port 0 as an output port. */
    s->in_port = 0;
    s->out_port = 1;
    ft_omx_get_port_definition(s, 0, &s->in_port_params);
    if (s->in_port_params.eDir != OMX_DirInput) {
        memcpy(&s->out_port_params, &s->in_port_params, sizeof(s->in_port_params));
        ft_omx_get_port_definition(s, 1, &s->in_port_params);
        s->in_port = 1;
        s->out_port = 0;
    } else {
        ft_omx_get_port_definition(s, 1, &s->out_port_params);
    }

    /* Provisional values; overwritten below with the component's actual
     * stride/slice height after the port update. */
    s->stride     = avctx->width;
    s->plane_size = avctx->height;

    /* config in port */
    port_def = &s->in_port_params;
    if (OMX_PortDomainVideo == port_def->eDomain) {
        port_def->format.video.nFrameWidth  = avctx->width;
        port_def->format.video.nFrameHeight = avctx->height;
        /* xFramerate is Q16 fixed point; fall back to the inverse of the
         * time base when no frame rate is known. */
        if (avctx->framerate.den > 0 && avctx->framerate.num > 0)
            port_def->format.video.xFramerate = (1LL << 16) * avctx->framerate.num / avctx->framerate.den;
        else
            port_def->format.video.xFramerate = (1LL << 16) * avctx->time_base.den / avctx->time_base.num;
    } else {
        port_def->format.image.nFrameWidth  = avctx->width;
        port_def->format.image.nFrameHeight = avctx->height;
    }

    err = ft_omx_update_port_definition(s, port_def);
    CHECK(err);

    /* Adopt the component's chosen layout (read back by the update call). */
    if (OMX_PortDomainVideo == port_def->eDomain) {
        s->stride         = port_def->format.video.nStride;
        s->plane_size     = port_def->format.video.nSliceHeight;
    } else {
        s->stride         = port_def->format.image.nStride;
        s->plane_size     = port_def->format.image.nSliceHeight;
    }
    s->num_in_buffers = port_def->nBufferCountActual;

    /* H.264-specific tuning: no B frames, GOP-derived P-frame count, and
     * an explicit profile when the user requested one. */
    if (avctx->codec->id == AV_CODEC_ID_H264) {
        OMX_VIDEO_PARAM_AVCTYPE avc = { 0 };
        INIT_STRUCT(avc);
        avc.nPortIndex = s->out_port;
        err = OMX_GetParameter(s->handle, OMX_IndexParamVideoAvc, &avc);
        CHECK(err);
        avc.nBFrames = 0;
        avc.nPFrames = avctx->gop_size - 1;
        switch (s->profile == FF_PROFILE_UNKNOWN ? avctx->profile : s->profile) {
        case FF_PROFILE_H264_BASELINE:
            avc.eProfile = OMX_VIDEO_AVCProfileBaseline;
            break;
        case FF_PROFILE_H264_MAIN:
            avc.eProfile = OMX_VIDEO_AVCProfileMain;
            break;
        case FF_PROFILE_H264_HIGH:
            avc.eProfile = OMX_VIDEO_AVCProfileHigh;
            break;
        default:
            break;
        }
        err = OMX_SetParameter(s->handle, OMX_IndexParamVideoAvc, &avc);
        CHECK(err);
    }

    /* config output port */
    /* Enumerate the port's supported formats until we find a semi-planar
     * YUV420 variant we can handle; i counts how many were offered. */
    port_def = &s->out_port_params;
    s->color_format = 0;
    for (i = 0; ; i++) {
        if (OMX_PortDomainVideo == port_def->eDomain) {
            INIT_STRUCT(video_port_format);
            video_port_format.nIndex = i;
            video_port_format.nPortIndex = s->out_port;
            if (OMX_GetParameter(s->handle, OMX_IndexParamVideoPortFormat, &video_port_format) != OMX_ErrorNone)
                break;
            if (video_port_format.eColorFormat == OMX_COLOR_FormatYUV420PackedSemiPlanar ||
                    video_port_format.eColorFormat == OMX_COLOR_FormatYUV420SemiPlanar) {
                s->color_format = video_port_format.eColorFormat;
                break;
            }
        } else {
            INIT_STRUCT(image_port_format);
            image_port_format.nIndex = i;
            image_port_format.nPortIndex = s->out_port;
            if (OMX_GetParameter(s->handle, OMX_IndexParamImagePortFormat, &image_port_format) != OMX_ErrorNone)
                break;
            if (image_port_format.eColorFormat == OMX_COLOR_FormatYUV420PackedSemiPlanar ||
                    image_port_format.eColorFormat == OMX_COLOR_FormatYUV420SemiPlanar) {
                s->color_format = image_port_format.eColorFormat;
                break;
            }
        }
    }

    if (s->color_format == 0) {
        av_log(avctx, AV_LOG_ERROR, "No supported pixel formats (%d formats available)\n", i);
        return AVERROR_UNKNOWN;
    }

    if (OMX_PortDomainVideo == port_def->eDomain) {
        port_def->format.video.eColorFormat = s->color_format;
    } else {
        port_def->format.image.eColorFormat = s->color_format;
    }

    /* Map the FFmpeg codec id onto the OMX coding type expected by the
     * component; image codecs use the image union member instead. */
    switch (avctx->codec->id) {
    case AV_CODEC_ID_H263:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingH263;
        break;
    case AV_CODEC_ID_H264:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingAVC;
        break;
    case AV_CODEC_ID_HEVC:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingHEVC;
        break;
    case AV_CODEC_ID_MPEG2VIDEO:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingMPEG2;
        break;
    case AV_CODEC_ID_MPEG4:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingMPEG4;
        break;
    case AV_CODEC_ID_VC1:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingVC1;
        break;
    case AV_CODEC_ID_VP6:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingVP6;
        break;
    case AV_CODEC_ID_VP8:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingVP8;
        break;
    case AV_CODEC_ID_JPEG2000:
        port_def->format.image.eCompressionFormat = OMX_IMAGE_CodingJPEG2K;
        //port_def->format.image.eCompressionFormat = OMX_IMAGE_CodingJPEG;
        break;
    case AV_CODEC_ID_MJPEG:
        port_def->format.image.eCompressionFormat = OMX_IMAGE_CodingJPEG;
        break;
    case AV_CODEC_ID_CAVS:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingAVS;
        break;
    case AV_CODEC_ID_RV30:
    case AV_CODEC_ID_RV40:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingRV;
        break;
    case AV_CODEC_ID_FLV1:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingSorensonSpark;
        break;
    default:
        port_def->format.video.eCompressionFormat = OMX_VIDEO_CodingAutoDetect;
        break;
    }
    if (OMX_PortDomainVideo == port_def->eDomain) {
        port_def->format.video.nFrameWidth  = avctx->width;
        port_def->format.video.nFrameHeight = avctx->height;
        port_def->eDomain = OMX_PortDomainVideo;
    } else {
        port_def->format.image.nFrameWidth  = avctx->width;
        port_def->format.image.nFrameHeight = avctx->height;
        port_def->eDomain = OMX_PortDomainImage;
    }
    /* NOTE(review): this update's result is only checked at the final
     * return, so a later success can mask a failure here — verify. */
    err = ft_omx_update_port_definition(s, port_def);

    //s->num_out_buffers = FTOMX_MAX_FRAME_BUFFER;
    s->in_buffer_headers  = av_mallocz(sizeof(OMX_BUFFERHEADERTYPE*) * s->num_in_buffers);
    s->free_in_buffers    = av_mallocz(sizeof(OMX_BUFFERHEADERTYPE*) * s->num_in_buffers);
    if (!s->in_buffer_headers || !s->free_in_buffers)
        return AVERROR(ENOMEM);

    /* disable output port */
    /* Output buffers are set up later (after the port-settings-changed
     * event); only input buffers are allocated during the Idle transition. */
    ft_omx_switch_port(s, s->out_port, OMX_FALSE, OMX_TRUE);
    err = OMX_SendCommand(s->handle, OMX_CommandStateSet, OMX_StateIdle, NULL);
    CHECK(err);

    for (i = 0; i < s->num_in_buffers && err == OMX_ErrorNone; i++) {
        err = OMX_AllocateBuffer(s->handle, &s->in_buffer_headers[i],  s->in_port,  s, s->in_port_params.nBufferSize);
    }
    CHECK(err);
    /* Record how many buffers were actually allocated and mark all of
     * them as free for the input path. */
    s->num_in_buffers = i;
    for (i = 0; i < s->num_in_buffers; i++)
        s->free_in_buffers[s->num_free_in_buffers++] = s->in_buffer_headers[i];

    if (wait_for_state(s, OMX_StateIdle) < 0) {
        av_log(avctx, AV_LOG_ERROR, "wait OMX_StateIdle failed\n");
        return AVERROR_UNKNOWN;
    }

    err = OMX_SendCommand(s->handle, OMX_CommandStateSet, OMX_StateExecuting, NULL);
    CHECK(err);
    if (wait_for_state(s, OMX_StateExecuting) < 0) {
        av_log(avctx, AV_LOG_ERROR, "wait OMX_StateExecuting failed\n");
        return AVERROR_UNKNOWN;
    }

    return err != OMX_ErrorNone ? AVERROR_UNKNOWN : 0;
}

/* Tear the component down and bring it back up; used after EOS so the
 * decoder can accept further input.
 * Bugfix: the original silently discarded ftomx_decode_init()'s error. */
static void ftomx_restart(AVCodecContext *avctx) {
    int ret;

    ftomx_decodec_end(avctx);
    ret = ftomx_decode_init(avctx);
    if (ret < 0)
        av_log(avctx, AV_LOG_ERROR, "ftomx_restart: reinit failed: %d\n", ret);
}

/* avcodec flush callback: drop the queued timestamp list, then either
 * restart the component (if it already drained to EOS) or flush both
 * ports and recycle all output buffers so decoding can resume cleanly
 * after a seek.
 * Bugfixes: the "need resent eos" log line was missing its '\n', and the
 * last-timestamp trace used %lx with an int64_t, which is undefined
 * behaviour on 32-bit targets (use PRIx64). */
static void ftomx_flush(AVCodecContext *avctx) {
    int executing;
    FTOMXCodecContext *s = avctx->priv_data;
    OMX_BUFFERHEADERTYPE* out_buffer = NULL;
    FTOMX_FRAME_TMS *fp = NULL;
    FTOMX_FRAME_TMS *next = NULL;
    int i = 0;

    /* Free the pending-timestamp list (nodes are malloc()ed). */
    fp = s->tms_head;
    while (fp) {
        next = fp->next;
        free(fp);
        fp = next;
    }
    s->tms_head = NULL;
    s->tms_tail = NULL;
    s->pending_num = 0;

    s->seeked = OMX_TRUE;

    if (!s->started || s->eos_sent) {
        av_log(avctx, AV_LOG_ERROR, "stream stopped\n");
        return;
    }

    pthread_mutex_lock(&s->state_mutex);
    executing = s->state == OMX_StateExecuting;
    pthread_mutex_unlock(&s->state_mutex);

    if (s->started && s->got_eos) {
        /* Fully drained: only a component restart gets us decoding again. */
        ftomx_restart(avctx);
    } else {
        if (executing) {
            /* Pause, flush input, resume, then flush output. */
            OMX_SendCommand(s->handle, OMX_CommandStateSet, OMX_StatePause, NULL);
            wait_for_state(s, OMX_StatePause);

            ft_omx_port_flush(s, s->in_port, OMX_TRUE);

            OMX_SendCommand(s->handle, OMX_CommandStateSet, OMX_StateExecuting, NULL);
            wait_for_state(s, OMX_StateExecuting);

            ft_omx_port_flush(s, s->out_port, OMX_TRUE);

            av_log(avctx, AV_LOG_TRACE, "%d buffer received\n", s->num_done_out_buffers);

            /* Hand every flushed output buffer back to the component. */
            for (i = 0; i < s->num_out_buffers; i++) {
                if (!s->out_buffer_headers[i]) {
                    continue;
                }

                out_buffer = get_buffer(&s->output_mutex, &s->output_cond,
                    &s->num_done_out_buffers, s->done_out_buffers, 1, 0);
                if (out_buffer->nFlags & OMX_BUFFERFLAG_EOS) {
                    av_log(avctx, AV_LOG_DEBUG, "receive eos when flush\n");
                }
                ftomx_reset_buffer(s, out_buffer);
                OMX_FillThisBuffer(s->handle, out_buffer);
            }

            if (s->eos_sent && !s->got_eos) {
                av_log(avctx, AV_LOG_DEBUG, "need resent eos after flush\n");
                s->eos_sent = OMX_FALSE; //need resent eos
            }
            av_log(avctx, AV_LOG_TRACE, "last timestamp before flush 0x%"PRIx64"\n", s->last_timestamp);
        }
    }
}

/* Release everything ftomx_decode_init()/ft_component_init() set up:
 * pending timestamp list, OMX buffers, the component handle, the OMX core
 * and all mutexes/condition variables. Safe to call on a partially
 * initialized context (NULL handle, Loaded state, NULL buffer arrays). */
static av_cold void cleanup(FTOMXCodecContext *s)
{
    int i;
    FTOMX_FRAME_TMS *fp = NULL;
    FTOMX_FRAME_TMS *next = NULL;

    s->started = OMX_FALSE;

    /* Free the pending-timestamp list (nodes are malloc()ed). */
    fp = s->tms_head;
    while (fp) {
        next = fp->next;
        free(fp);
        fp = next;
    }
    s->tms_head = NULL;
    s->tms_tail = NULL;

    /* Buffers may only be freed from Idle/Executing; flush both ports
     * first, then walk the component down to Loaded. */
    if ((s->state == OMX_StateExecuting) || (s->state == OMX_StateIdle)) {
        ft_omx_port_flush(s, s->in_port, OMX_TRUE);
        ft_omx_port_flush(s, s->out_port, OMX_TRUE);

        if (s->state == OMX_StateExecuting) {
            OMX_SendCommand(s->handle, OMX_CommandStateSet, OMX_StateIdle, NULL);
            wait_for_state(s, OMX_StateIdle);
        }
        OMX_SendCommand(s->handle, OMX_CommandStateSet, OMX_StateLoaded, NULL);
        /* NOTE(review): get_buffer blocks until a free input buffer is
         * available; this assumes the flush returned all of them — verify. */
        for (i = 0; i < s->num_in_buffers; i++) {
            OMX_BUFFERHEADERTYPE *buffer = get_buffer(&s->input_mutex, &s->input_cond,
                                                      &s->num_free_in_buffers, s->free_in_buffers, 1, 0);
            OMX_FreeBuffer(s->handle, s->in_port, buffer);
        }

        for (i = 0; i < s->num_out_buffers; i++) {
            OMX_BUFFERHEADERTYPE *buffer = s->out_buffer_headers[i];
            if (!buffer) {
                continue;
            }
            OMX_FreeBuffer(s->handle, s->out_port, buffer);
        }

        /* The Loaded transition only completes once all buffers are freed. */
        wait_for_state(s, OMX_StateLoaded);
    }

    if (s->handle) {
        s->omx_context->ptr_FreeHandle(s->handle);
        s->handle = NULL;
    }

    av_log(s->avctx, AV_LOG_DEBUG, "cleanup finish\n");

    omx_deinit(s->omx_context);
    s->omx_context = NULL;
    /* Destroy synchronization primitives only if init created them. */
    if (s->mutex_cond_inited) {
        pthread_cond_destroy(&s->state_cond);
        pthread_mutex_destroy(&s->state_mutex);
        pthread_cond_destroy(&s->input_cond);
        pthread_mutex_destroy(&s->input_mutex);
        pthread_cond_destroy(&s->output_cond);
        pthread_mutex_destroy(&s->output_mutex);
        pthread_cond_destroy(&s->port_cond);
        pthread_mutex_destroy(&s->port_mutex);
        pthread_cond_destroy(&s->flush_cond);
        pthread_mutex_destroy(&s->flush_mutex);
        pthread_cond_destroy(&s->buffer_cond);
        pthread_mutex_destroy(&s->buffer_mutex);
        s->mutex_cond_inited = 0;
    }

    av_freep(&s->in_buffer_headers);
    av_freep(&s->out_buffer_headers);
    av_freep(&s->free_in_buffers);
    av_freep(&s->done_out_buffers);
}

/* Read tuning knobs from the environment:
 *  FTOMX_LOW_LATENCY    - non-zero enables low-latency mode
 *  FTOMX_FORMAT_CONVERT - overrides the per-architecture conversion default
 *  FTOMX_FORMAT_RGB     - (only when converting) non-zero selects BGR output
 */
static void loadEnv(FTOMXCodecContext *s) {
    const char *val;

    val = getenv("FTOMX_LOW_LATENCY");
    s->low_latency = val ? atoi(val) : 0;

#if defined(__aarch64__) || defined(__arm__)
    s->format_convert = OMX_TRUE; /* convert to I420 */
#endif
    val = getenv("FTOMX_FORMAT_CONVERT");
    if (val)
        s->format_convert = atoi(val) ? OMX_TRUE : OMX_FALSE;

    if (s->format_convert) {
        val = getenv("FTOMX_FORMAT_RGB");
        if (val && atoi(val))
            s->format_bgr = OMX_TRUE;
    }
}

/* avcodec init callback: load the OMX core, create synchronization
 * primitives, map the codec id to an OMX role, find a matching component
 * and bring it up via ft_component_init().
 *
 * Bugfix: all failure paths after the mutexes/OMX context were created
 * previously returned directly, leaking the OMX core handle and every
 * mutex/cond. They now funnel through cleanup(), which tolerates a
 * partially initialized context. The unsupported-codec case is routed
 * through the same path instead of returning directly.
 *
 * @return 0 on success, a negative AVERROR on failure.
 */
static av_cold int ftomx_decode_init(AVCodecContext *avctx)
{
    FTOMXCodecContext *s = avctx->priv_data;
    int ret = AVERROR_ENCODER_NOT_FOUND;
    const char *role;

    //av_log_set_level(AV_LOG_TRACE);

    /* Idempotent: a second init on an initialized context is a no-op. */
    if (s->inited) {
        return 0;
    }

    s->omx_context = ftomx_init(avctx, s->libname);
    if (!s->omx_context) {
        return AVERROR_ENCODER_NOT_FOUND;
    }

    pthread_mutex_init(&s->state_mutex, NULL);
    pthread_cond_init(&s->state_cond, NULL);
    pthread_mutex_init(&s->port_mutex, NULL);
    pthread_cond_init(&s->port_cond, NULL);
    pthread_mutex_init(&s->flush_mutex, NULL);
    pthread_cond_init(&s->flush_cond, NULL);
    pthread_mutex_init(&s->input_mutex, NULL);
    pthread_cond_init(&s->input_cond, NULL);
    pthread_mutex_init(&s->output_mutex, NULL);
    pthread_cond_init(&s->output_cond, NULL);
    pthread_mutex_init(&s->buffer_mutex, NULL);
    pthread_cond_init(&s->buffer_cond, NULL);
    s->mutex_cond_inited = 1;
    s->avctx = avctx;
    s->state = OMX_StateLoaded;
    s->error = OMX_ErrorNone;

    /* Map the FFmpeg codec id onto the standard OMX decoder role name. */
    switch (avctx->codec->id) {
    case AV_CODEC_ID_MPEG2VIDEO:
        role = "video_decoder.mpeg2";
        break;
    case AV_CODEC_ID_MPEG4:
        role = "video_decoder.mpeg4";
        break;
    case AV_CODEC_ID_H263:
        role = "video_decoder.h263";
        break;
    case AV_CODEC_ID_H264:
        role = "video_decoder.avc";
        break;
    case AV_CODEC_ID_HEVC:
        role = "video_decoder.hevc";
        break;
    case AV_CODEC_ID_VC1:
        role = "video_decoder.vc1";
        break;
    case AV_CODEC_ID_VP6:
        role = "video_decoder.vp6";
        break;
    case AV_CODEC_ID_VP8:
        role = "video_decoder.vp8";
        break;
    case AV_CODEC_ID_CAVS:
        role = "video_decoder.avs";
        break;
    case AV_CODEC_ID_RV30:
    case AV_CODEC_ID_RV40:
        role = "video_decoder.real";
        break;
    case AV_CODEC_ID_JPEG2000:
        //role = "video_decoder.mjpeg";
        role = "image_decoder.JPEG";
        break;
    case AV_CODEC_ID_MJPEG:
        //role = "video_decoder.mjpeg";
        role = "image_decoder.JPEG";
        break;
    case AV_CODEC_ID_FLV1:
        role = "video_decoder.sorenson";
        break;

    default:
        ret = AVERROR(ENOSYS);
        goto fail;
    }

    if ((ret = find_component(s->omx_context, avctx, role, s->component_name, sizeof(s->component_name))) < 0)
        goto fail;

    av_log(avctx, AV_LOG_DEBUG, "Using %s\n", s->component_name);

    if ((ret = ft_component_init(avctx, role)) < 0)
        goto fail;

    s->last_timestamp = AV_NOPTS_VALUE;

    s->inited = OMX_TRUE;
    s->req_num = 0;
    s->done_num = 0;
    s->seeked = OMX_FALSE;
    s->format_convert = OMX_FALSE;
    s->reordered = OMX_FALSE;
    s->tms_head = NULL;
    s->tms_tail = NULL;

    /* Environment variables may override the defaults set above. */
    loadEnv(s);

    /* Low latency is incompatible with streams that reorder frames. */
    if (avctx->has_b_frames) {
        s->low_latency = 0;
    }

    return 0;
fail:
    cleanup(s);
    return ret;
}

/**
 * Copy a decoded OMX output buffer into @p pict and recycle the buffer.
 *
 * Bugfix: the trace log used %lx for int64_t values (pts/dts), which is
 * undefined behaviour where long is 32-bit (e.g. 32-bit ARM, a target of
 * this file); use PRIx64 instead.
 *
 * @return 1 when a frame was produced, 0 otherwise.
 */
static int ftomx_recv_new_pict(FTOMXCodecContext *s, AVFrame *pict, OMX_BUFFERHEADERTYPE* out_buffer) {
    OMX_ERRORTYPE err;
    int got = 0;
    /* Save packet metadata: fill_frame_buffer()'s frame setup can clobber
     * these fields, so they are restored below. */
    int64_t pts = pict->pts;
    int64_t pkt_pos = pict->pkt_pos;
    int64_t pkt_duration = pict->pkt_duration;
    int32_t pkt_size = pict->pkt_size;

    if (s->reordered) {
        pict->reordered_opaque = pict->pts;
        //s->avctx->reordered_opaque = pict->pts;
    }

    if (!fill_frame_buffer(s, pict, out_buffer->pBuffer)) {
        /* Copy failed; still hand the buffer back to the component so it
         * is not lost, queueing it again if FillThisBuffer fails. */
        s->done_num++;
        ftomx_reset_buffer(s, out_buffer);
        err = OMX_FillThisBuffer(s->handle, out_buffer);
        if (err != OMX_ErrorNone) {
            append_buffer(&s->output_mutex, &s->output_cond, &s->num_done_out_buffers, s->done_out_buffers, out_buffer);
            av_log(s->avctx, AV_LOG_ERROR, "OMX_FillThisBuffer failed: %x\n", err);
        }
        goto DONE;
    }

    /* refix pts(get_buffer may modify pts) */
    pict->pts = pts;
    pict->pkt_pos = pkt_pos;
    pict->pkt_duration = pkt_duration;
    pict->pkt_size = pkt_size;
    if (s->reordered) {
        pict->reordered_opaque = pict->pts;
    }

    s->done_num++;
    ftomx_reset_buffer(s, out_buffer);
    err = OMX_FillThisBuffer(s->handle, out_buffer);
    if (err != OMX_ErrorNone) {
        append_buffer(&s->output_mutex, &s->output_cond, &s->num_done_out_buffers, s->done_out_buffers, out_buffer);
        av_log(s->avctx, AV_LOG_ERROR, "OMX_FillThisBuffer failed: %x\n", err);
    }

    av_log(s->avctx, AV_LOG_TRACE, "display new frame, pts:0x%"PRIx64", dts:0x%"PRIx64"\n", pict->pts, pict->pkt_dts);

    got = 1;

DONE:
    return got;
}

/* Translate the OMX output color format into an AVPixelFormat, storing it
 * in s->orig_format and publishing it as avctx->pix_fmt.
 *
 * Improvement: formats with no AVPixelFormat mapping previously fell out
 * of their case silently, leaving s->orig_format at its prior (possibly
 * stale) value; they now emit a warning so the condition is visible. The
 * resulting value is unchanged. */
static void update_color_format(FTOMXCodecContext *s, OMX_COLOR_FORMATTYPE color_format) {
    AVCodecContext *avctx = s->avctx;

    switch (color_format) {
        //yuv420
        case OMX_COLOR_FormatYUV420PackedPlanar:
            //1.5x
            s->orig_format = AV_PIX_FMT_YUV420P;
            break;

        case OMX_COLOR_FormatYUV420SemiPlanar:
        case OMX_COLOR_FormatYUV420PackedSemiPlanar:
            //1.5x
            s->orig_format = AV_PIX_FMT_NV12;
            break;

        case OMX_COLOR_FormatYVU420SemiPlanar:
        case OMX_COLOR_FormatYVU420PackedSemiPlanar:
            //1.5x
            s->orig_format = AV_PIX_FMT_NV21;
            break;

            //10 bits format using most siginficant bits of a word (16b)
        case OMX_COLOR_Format10bitMsbYUV420PackedSemiPlanar:
            //3x
            s->orig_format = AV_PIX_FMT_P010LE; //6bit shift
            break;

        case OMX_COLOR_Format10bitMsbYVU420PackedSemiPlanar:
            //3x
            //10bit NV21 - no AVPixelFormat equivalent available
            av_log(avctx, AV_LOG_WARNING,
                   "no AVPixelFormat mapping for OMX color format %d\n", color_format);
            break;

            //yuv422
        case OMX_COLOR_FormatYUV422PackedPlanar:
            //2x
            s->orig_format = AV_PIX_FMT_YUV422P;
            break;

        case OMX_COLOR_FormatYCbYCr:
            //2x
            s->orig_format = AV_PIX_FMT_YUYV422;
            break;

        case OMX_COLOR_FormatCbYCrY:
            //2x
            s->orig_format = AV_PIX_FMT_UYVY422;
            break;

        case OMX_COLOR_FormatYUV422PackedSemiPlanar:
            //2x
            s->orig_format = AV_PIX_FMT_NV16;
            break;

        case OMX_COLOR_FormatYVU422PackedSemiPlanar:
            //2x - no AVPixelFormat equivalent available
            av_log(avctx, AV_LOG_WARNING,
                   "no AVPixelFormat mapping for OMX color format %d\n", color_format);
            break;

        case OMX_COLOR_Format10bitMsbYUV422PackedSemiPlanar:
            //4x
            //mismatch: NV20LE offset=0, request offset=6;
            //fix when output
            s->orig_format = AV_PIX_FMT_NV20LE;
            break;

        case OMX_COLOR_Format10bitMsbYVU422PackedSemiPlanar:
            //4x - no AVPixelFormat equivalent available
            av_log(avctx, AV_LOG_WARNING,
                   "no AVPixelFormat mapping for OMX color format %d\n", color_format);
            break;

            //0b00CCCCCCCCCBBBBBBBBBBAAAAAAAAAA, 30bit from dword
            //3 pixels packed per dword with no padding between them (also for
            //luma: 3 Y samples per dword), 2 bits of padding after each triple
        case OMX_COLOR_Format10bitYUV422PackedSemiPlanar:
        case OMX_COLOR_Format10bitYVU422PackedSemiPlanar:
            //((width + 2) / 3) * 4 * height - no AVPixelFormat equivalent
            av_log(avctx, AV_LOG_WARNING,
                   "no AVPixelFormat mapping for OMX color format %d\n", color_format);
            break;

#if 0 //not support
                //yuv444
        case OMX_COLOR_FormatYUV444PackedSemiPlanar:
            //3x
            s->orig_format = AV_PIX_FMT_NV24;
            break;

        case OMX_COLOR_FormatYVU444PackedSemiPlanar:
            //3x
            s->orig_format = AV_PIX_FMT_NV42;
            break;

        case OMX_COLOR_Format10bitMsbYUV444PackedSemiPlanar:
            //8x
            break;

        case OMX_COLOR_Format10bitMsbYVU444PackedSemiPlanar:
            //8x
            break;
#endif

        case OMX_COLOR_FormatL8:
            s->orig_format = AV_PIX_FMT_GRAY8;
            break;

        default:
            s->orig_format = AV_PIX_FMT_NV12;
            break;
    }
    avctx->pix_fmt = s->orig_format;
}

#if 0
/* NOTE(review): dead code kept under #if 0 — rounds src up to the next
 * multiple of align; currently unused. */
static int align_to(int src, int align) {
    if (src % align) {
        return src + (align - src % align);
    }
    return src;
}
#endif

#if 0
/* split the uv plane to seperate plane */
/* NOTE(review): dead code kept under #if 0 — de-interleaves an NV12-style
 * interleaved UV plane into two separate planes using word-at-a-time bit
 * shuffling; caller would own (and free) the returned buffer. Unused. */
static uint8_t *split_uv_plane(FTOMXCodecContext *s, int stride, int slice, uint8_t *buf) {
    uint8_t *uv_buf = NULL;
    int plane_size = 0;
    int uv_stride = 0;
    int size = 0;
    int l = 0;
    int i = 0, j = 0, k = 0;
    int idx = 0;
    unsigned long int *src[2] = {NULL, NULL};
    unsigned long int *dst[2] = {NULL, NULL};
    unsigned long int *cur = NULL;

    uv_stride = stride / 2;
    plane_size = uv_stride * slice / 2;
    size = plane_size * 2;
    l = stride / sizeof(unsigned long int);

    uv_buf = malloc(size);
    if (!uv_buf) {
        av_log(s->avctx, AV_LOG_ERROR, "malloc failed\n");
    } else {
        memset(uv_buf, 0, size);
        for (j = 0; j < slice / 2; j++) {
            src[0] = (unsigned long int *)(buf + stride * j);
            src[1] = src[0] + 1;
            dst[0] = (unsigned long int *)(uv_buf + uv_stride * j);
            dst[1] = (unsigned long int *)((uint8_t *)dst[0] + plane_size);
            for (i = 0; i < l / 2; i++) {
                for (k = 0; k < sizeof(unsigned long int); k++) {
                    if (2 * k < sizeof(unsigned long int)) {
                        cur = src[0];
                        idx = 2 * k;
                        *dst[0] |= (*cur & (0xfful << 8 * idx)) >> 8 * (idx - k);
                        *dst[1] |= (*cur & (0xfful << 8 * (idx + 1))) >> 8 * (idx + 1 - k);
                    } else {
                        cur = src[1];
                        idx = 2 * k - sizeof(unsigned long int);
                        *dst[0] |= (*cur & (0xfful << 8 * idx)) << 8 * (k - idx);
                        *dst[1] |= (*cur & (0xfful << 8 * (idx + 1))) << 8 * (k - idx - 1);
                    }
                }
                src[0] += 2;
                src[1] += 2;
                dst[0] += 1;
                dst[1] += 1;
            }
        }
    }

    return uv_buf;
}
#endif

/* Map the decoder's native output format to the format the conversion
 * step produces; formats with no converter pass through unchanged. */
static enum AVPixelFormat get_target_format(FTOMXCodecContext *s) {
    enum AVPixelFormat fmt = s->orig_format;

    if (fmt == AV_PIX_FMT_NV12)
        return s->format_bgr ? AV_PIX_FMT_BGR0 : AV_PIX_FMT_YUV420P;

    if (fmt == AV_PIX_FMT_NV16)
        return AV_PIX_FMT_YUV422P;

    if (fmt == AV_PIX_FMT_P010LE)
        return AV_PIX_FMT_YUV420P10LE;

    if (fmt == AV_PIX_FMT_NV20LE)
        return AV_PIX_FMT_YUV422P10LE;

    return fmt;
}

/* Run the converter matching the (source format, destination format) pair;
 * any combination without a converter leaves pict untouched. */
static void convert_pix_format(enum AVPixelFormat orig_fmt, AVFrame *pict, uint8_t *src_data[], int *src_linesizes) {
    if (orig_fmt == AV_PIX_FMT_NV12) {
        if (pict->format == AV_PIX_FMT_YUV420P) {
            NV12ToI420(src_data[0], src_linesizes[0], src_data[1], src_linesizes[1], pict->data[0], pict->linesize[0], pict->data[1], pict->linesize[1], pict->data[2], pict->linesize[2], pict->width, pict->height);
        } else if (pict->format == AV_PIX_FMT_BGR0) {
            NV12ToARGB(src_data[0], src_linesizes[0], src_data[1], src_linesizes[1], pict->data[0], pict->linesize[0], pict->width, pict->height);
        }
    } else if (orig_fmt == AV_PIX_FMT_NV16) {
        if (pict->format == AV_PIX_FMT_YUV422P) {
            NV16ToI422(src_data[0], src_linesizes[0], src_data[1], src_linesizes[1], pict->data[0], pict->linesize[0], pict->data[1], pict->linesize[1], pict->data[2], pict->linesize[2], pict->width, pict->height);
        }
    } else if (orig_fmt == AV_PIX_FMT_NV20LE) {
        if (pict->format == AV_PIX_FMT_YUV422P10LE) {
            NV20LETo422P10LE(src_data[0], src_linesizes[0], src_data[1], src_linesizes[1], pict->data[0], pict->linesize[0], pict->data[1], pict->linesize[1], pict->data[2], pict->linesize[2], pict->width, pict->height);
        }
    } else if (orig_fmt == AV_PIX_FMT_P010LE) {
        if (pict->format == AV_PIX_FMT_YUV420P10LE) {
            P010LETo420P10LE(src_data[0], src_linesizes[0], src_data[1], src_linesizes[1], pict->data[0], pict->linesize[0], pict->data[1], pict->linesize[1], pict->data[2], pict->linesize[2], pict->width, pict->height);
        }
    }
}

/* Describe the layout of a raw output buffer from the component:
 * fills src_data[] (plane base pointers into buf), src_linesizes[] (bytes
 * per line) and dst_slice[] (visible rows per plane) for the given pixel
 * format, using the stride/slice height reported on the output port.
 * The "x" comments (1.5x, 2x, ...) are the buffer size relative to
 * stride*height. Image-domain (MJPEG) ports use different chroma layouts
 * than video-domain ports for the planar formats. */
static void get_source_pix_info(FTOMXCodecContext *s, enum AVPixelFormat pix_fmt, uint8_t *buf, int *src_linesizes, uint8_t *src_data[], int *dst_slice) {
    OMX_PARAM_PORTDEFINITIONTYPE *port_def = NULL;
    int stride, slice, height;

    port_def = &s->out_port_params;

    /* Stride/slice height come from the output port definition; the union
     * member depends on the port domain. */
    if (OMX_PortDomainVideo == port_def->eDomain) {
        stride = port_def->format.video.nStride;
        slice = port_def->format.video.nSliceHeight;
    } else {
        stride = port_def->format.image.nStride;
        slice = port_def->format.image.nSliceHeight;
    }

    if (OMX_PortDomainVideo == port_def->eDomain) {
        height = port_def->format.video.nFrameHeight;
    } else {
        height = port_def->format.image.nFrameHeight;
    }
    /* Plane 0 (luma / packed data) is always at the start of the buffer. */
    src_linesizes[0] = stride;
    dst_slice[0] = height;
    src_data[0] = buf;

    switch (pix_fmt) {
        case AV_PIX_FMT_YUV420P:
            if (OMX_PortDomainVideo == port_def->eDomain) {
                //1.5x
                src_linesizes[1] = stride / 2;
                src_linesizes[2] = stride / 2;
                dst_slice[1] = height / 2;
                dst_slice[2] = height / 2;
                src_data[1] = buf + stride * slice;
                src_data[2] = src_data[1] + stride / 2 * slice / 2;
            } else { //mjpeg, 2x
                /* NOTE(review): image ports apparently keep full-stride
                 * chroma lines here — confirm against the component. */
                src_linesizes[1] = stride;
                src_linesizes[2] = stride;
                dst_slice[1] = height / 2;
                dst_slice[2] = height / 2;
                src_data[1] = buf + stride * slice;
                src_data[2] = src_data[1] + stride * slice / 2;
            }
            break;

            //3x
        case AV_PIX_FMT_P010LE:
        case AV_PIX_FMT_P010BE:
            src_linesizes[1] = stride;
            src_data[1] = buf + stride * slice;
            //stride = 2 * aligned_width
            dst_slice[1] = height / 2;
            break;


            //3x, nv12, 10bit
        case AV_PIX_FMT_P016LE:
        case AV_PIX_FMT_P016BE:
            src_linesizes[1] = stride;
            src_data[1] = buf + stride * slice;
            //stride = 2 * aligned_width
            dst_slice[1] = height / 2;
            break;

            //2x
        case AV_PIX_FMT_YUV422P:
            if (OMX_PortDomainVideo == port_def->eDomain) {
                /* no support */
                src_linesizes[1] = stride / 2;
                src_linesizes[2] = stride / 2;
                src_data[1] = buf + stride * slice;
                src_data[2] = src_data[1] + stride * slice / 2;
                dst_slice[1] = height;
                dst_slice[2] = height;
            } else { //mjpeg, 3x
                src_linesizes[1] = stride;
                src_linesizes[2] = stride;
                src_data[1] = buf + stride * slice;
                src_data[2] = src_data[1] + stride * slice;
                dst_slice[1] = height;
                dst_slice[2] = height;
            }
            break;

#if 0 //not support
            //2x
        case AV_PIX_FMT_YUYV422:
        case AV_PIX_FMT_UYVY422:
            src_linesizes[0] = stride * 2;
            break;
#endif

            //2x, YUV422, UV order
        case AV_PIX_FMT_NV16:
            src_linesizes[1] = stride;
            src_data[1] = buf + stride * slice;
            dst_slice[1] = height;
            break;

            //4x, YUV422, nv20, 10bit
        case AV_PIX_FMT_NV20LE:
            src_linesizes[1] = stride;
            src_data[1] = buf + stride * slice;
            //convert to NV20
#if 0
            {
                unsigned short *ptr = (unsigned short *)src_data[0];
                unsigned short *end = (unsigned short *)(buf + stride * slice * 2);
                while (ptr < end) {
                    *ptr = (*ptr) >> 6;
                    ptr++;
                }
            }
#endif
            //stride = 2 * aligned_width
            dst_slice[1] = height;
            break;

#if 0 //not support
            //3x, YUV444
        case AV_PIX_FMT_NV24:
        case AV_PIX_FMT_NV42:
            src_linesizes[1] = stride * 2;
            src_data[1] = buf + stride * slice;
            break;
#endif

        case AV_PIX_FMT_GRAY8:
            /* Single plane; the defaults set above are sufficient. */
            break;

            //1.5x
        case AV_PIX_FMT_NV12:
        case AV_PIX_FMT_NV21:
        default:
            src_linesizes[1] = stride;
            src_data[1] = buf + stride * slice;
            dst_slice[1] = height / 2;
            break;
    }
}

/**
 * Copy (or pixel-format-convert) one decoded OMX output buffer into an AVFrame.
 *
 * Allocates the destination frame via ff_get_buffer(), derives the source
 * plane pointers/strides from the OMX port definition, then either converts
 * (when format_convert requested a different target format) or does a plain
 * av_image_copy().
 *
 * @param s    decoder context (provides out_port_params, orig_format, ...)
 * @param pict destination frame; width/height/format are filled in here
 * @param buf  raw OMX output buffer holding the decoded picture
 * @return OMX_TRUE on success, OMX_FALSE if the frame buffer could not be
 *         allocated.
 */
static OMX_BOOL fill_frame_buffer(FTOMXCodecContext *s, AVFrame *pict, uint8_t *buf) {
    AVCodecContext *avctx = s->avctx;
    OMX_PARAM_PORTDEFINITIONTYPE *port_def = &s->out_port_params;
    uint8_t *src_data[4] = {0};
    int src_linesizes[4] = {0};
    /* filled by get_source_pix_info(); not consumed here — presumably kept
     * for the conversion path's slice heights. TODO(review): confirm. */
    int dst_slice[4] = {0};
    enum AVPixelFormat orig_format = s->orig_format;
    enum AVPixelFormat out_format = orig_format;

    /* convert nv12 to yuv420p (for chromium) when requested */
    if (s->format_convert) {
        out_format = get_target_format(s);
    }

    pict->format = out_format;
    avctx->pix_fmt = out_format;

    /* Fall back to the OMX port's display size if the caller never set one;
     * video and image domains keep the size in different union members. */
    if (!avctx->width) {
        if (OMX_PortDomainVideo == port_def->eDomain) {
            avctx->width = port_def->format.video.nFrameWidth; //display size
        } else {
            avctx->width = port_def->format.image.nFrameWidth; //display size
        }
    }
    if (!avctx->height) {
        if (OMX_PortDomainVideo == port_def->eDomain) {
            avctx->height = port_def->format.video.nFrameHeight;
        } else {
            avctx->height = port_def->format.image.nFrameHeight;
        }
    }
    pict->width = avctx->width;
    pict->height = avctx->height;

    if (ff_get_buffer(avctx, pict, 0) < 0) {
        av_log(s->avctx, AV_LOG_ERROR, "get_buffer failed, pix_fmt:%d, einval:%d\n", out_format, AVERROR(EINVAL));
        return OMX_FALSE;
    }

    get_source_pix_info(s, s->orig_format, buf, src_linesizes, src_data, dst_slice);

    if (out_format != s->orig_format) {
        convert_pix_format(s->orig_format, pict, src_data, src_linesizes);
    } else {
        /* Fix: pass the decayed array (uint8_t **), not its address
         * (uint8_t *(*)[4]) — the old cast of &src_data only worked by
         * accident of identical addresses. */
        av_image_copy(pict->data, pict->linesize, (const uint8_t **)src_data,
                      src_linesizes, avctx->pix_fmt, pict->width, pict->height);
    }

    return OMX_TRUE;
}

/**
 * (Re)configure the OMX output port after a port-settings-changed event.
 *
 * Two cases:
 *  - Port disabled: allocate the output buffer-header arrays, enable the
 *    port, allocate OMX buffers and queue them all with OMX_FillThisBuffer.
 *  - Port enabled: flush the port and re-queue every returned buffer.
 *
 * Sets s->started on completion.
 */
static void ftomx_reconfig_outport(FTOMXCodecContext *s) {
    AVCodecContext *avctx = s->avctx;
    OMX_ERRORTYPE err = OMX_ErrorNone;
    OMX_BUFFERHEADERTYPE* out_buffer = NULL;
    int i = 0;

    ft_omx_get_port_definition(s, s->out_port, &s->out_port_params);
    s->num_out_buffers = s->out_port_params.nBufferCountActual;

    /* video and image domains keep the color format in different union
     * members; MJPEG is routed through the video-style accessor */
    if ((OMX_PortDomainVideo == s->out_port_params.eDomain) || (avctx->codec->id == AV_CODEC_ID_MJPEG)) {
        update_color_format(s, s->out_port_params.format.video.eColorFormat);
    } else {
        update_color_format(s, s->out_port_params.format.image.eColorFormat);
    }

    if (!s->out_port_params.bEnabled) {
        s->num_done_out_buffers = 0;

        if (!s->out_buffer_headers) {
            s->out_buffer_headers = av_mallocz(sizeof(OMX_BUFFERHEADERTYPE*) * s->num_out_buffers);
        }
        if (!s->done_out_buffers) {
            s->done_out_buffers   = av_mallocz(sizeof(OMX_BUFFERHEADERTYPE*) * s->num_out_buffers);
        }
        /* Fix: bail out on OOM instead of dereferencing NULL below */
        if (!s->out_buffer_headers || !s->done_out_buffers) {
            av_log(avctx, AV_LOG_ERROR, "failed to allocate output buffer lists\n");
            return;
        }

        //startup output port
        ft_omx_switch_port(s, s->out_port, OMX_TRUE, OMX_FALSE);

        for (i = 0; i < s->num_out_buffers && err == OMX_ErrorNone; i++) {
            err = OMX_AllocateBuffer(s->handle, &s->out_buffer_headers[i], s->out_port, NULL, s->out_port_params.nBufferSize);
            /* Fix: only touch the header when allocation succeeded —
             * on failure the header pointer is not valid */
            if (err == OMX_ErrorNone)
                s->out_buffer_headers[i]->pAppPrivate = s;
        }
        //CHECK(err);
        wait_for_port_onoff(s, s->out_port, OMX_TRUE);

        //OMX_SendCommand(s->handle, OMX_CommandStateSet, OMX_StateExecuting, NULL);
        //wait_for_state(s, OMX_StateExecuting);

        for (i = 0; i < s->num_out_buffers && err == OMX_ErrorNone; i++) {
            err = OMX_FillThisBuffer(s->handle, s->out_buffer_headers[i]);
        }

        /* any buffers not handed to the component stay on the done list */
        if (err != OMX_ErrorNone) {
            for (; i < s->num_out_buffers; i++)
                s->done_out_buffers[s->num_done_out_buffers++] = s->out_buffer_headers[i];
        }
        ft_omx_get_port_definition(s, s->out_port, &s->out_port_params);
    } else {
        ft_omx_port_flush(s, s->out_port, OMX_TRUE);

        av_log(avctx, AV_LOG_DEBUG, "reconfig: %d out buffer received\n", s->num_done_out_buffers);

        /* collect every flushed buffer and hand it back to the component */
        for (i = 0; i < s->num_out_buffers; i++) {
            out_buffer = get_buffer(&s->output_mutex, &s->output_cond,
                    &s->num_done_out_buffers, s->done_out_buffers, 1, 0);
            if (out_buffer->nFlags & OMX_BUFFERFLAG_EOS) {
                av_log(avctx, AV_LOG_DEBUG, "receive eos when reconfig\n");
            }
            ftomx_reset_buffer(s, out_buffer);
            OMX_FillThisBuffer(s->handle, out_buffer);
        }
    }

    s->started = OMX_TRUE;
    av_log(s->avctx, AV_LOG_DEBUG, "reconfig outport finish\n");
}

/**
 * Feed one compressed packet into the OMX component, piecewise if the packet
 * is larger than one input buffer, and opportunistically collect one decoded
 * frame into *data.
 *
 * Flow per loop iteration (while packet bytes remain):
 *  1. send codec extradata once (OMX_BUFFERFLAG_CODECCONFIG),
 *  2. send any AV_PKT_DATA_NEW_EXTRADATA side data,
 *  3. copy the next slice of the packet into a free input buffer and
 *     OMX_EmptyThisBuffer it,
 *  4. after the whole packet is queued, run the deferred output-port
 *     reconfiguration, then try to fetch a filled output buffer.
 *
 * Returns 0 with *got_frame set when a frame was produced, AVERROR(EAGAIN)
 * when no frame is available yet, AVERROR_EOF on unexpected EOS, or
 * AVERROR_UNKNOWN on OMX failures.
 */
static int ftomx_handle_pkt(AVCodecContext *avctx, void *data,
        int *got_frame, AVPacket *avpkt)
{
    FTOMXCodecContext *s = avctx->priv_data;
    OMX_BUFFERHEADERTYPE* buffer = NULL;
    OMX_BUFFERHEADERTYPE* out_buffer = NULL;
    OMX_ERRORTYPE err;
    AVFrame *pict      = data;
    int new_extradata_size;
    uint8_t *new_extradata;
    int ret = 0;
    int timeout = 100;

    const uint8_t *buf = avpkt->data;
    int buf_size       = avpkt->size;
    int64_t tms;   /* reordered timestamp; set on the first slice (offset==0) */
    int offset = 0;
    int wait = 0;

    av_log(avctx, AV_LOG_TRACE, "receive pkt pts:0x%lx, dts:0x%lx, reordered_opaque:0x%lx\n", avpkt->pts, avpkt->dts, avctx->reordered_opaque);

    //global extradata
    /* Send avctx->extradata once per stream, flagged as codec config.
     * This call blocks (wait=1) until an input buffer is free. */
    if (!s->extradata_decoded) {
        if (avctx->extradata && (avctx->extradata_size > 0)) {
            buffer = get_buffer(&s->input_mutex, &s->input_cond,
                    &s->num_free_in_buffers, s->free_in_buffers, 1, 0);

            if (buffer) {
                av_assert0(avctx->extradata_size <= buffer->nAllocLen);

                buffer->nFlags |= OMX_BUFFERFLAG_CODECCONFIG;
                buffer->nFlags |= OMX_BUFFERFLAG_ENDOFFRAME;
                buffer->nFilledLen = avctx->extradata_size;
                memcpy(buffer->pBuffer + buffer->nOffset, avctx->extradata, avctx->extradata_size);

                err = OMX_EmptyThisBuffer(s->handle, buffer);
                if (err != OMX_ErrorNone) {
                    /* put the buffer back on the free list before failing */
                    append_buffer(&s->input_mutex, &s->input_cond, &s->num_free_in_buffers, s->free_in_buffers, buffer);
                    av_log(avctx, AV_LOG_ERROR, "OMX_EmptyThisBuffer failed: %x\n", err);
                    return AVERROR_UNKNOWN;
                }
            }
        }
        s->extradata_decoded = OMX_TRUE;
    }

    new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
            &new_extradata_size);

    while (offset < buf_size) {
        wait = 0;

        //handle pkt side data
        /* Mid-stream extradata is sent in its own input buffer, non-blocking
         * (wait=0): if no buffer is free we retry on the next iteration. */
        if (new_extradata && new_extradata_size > 0) {
            buffer = get_buffer(&s->input_mutex, &s->input_cond,
                    &s->num_free_in_buffers, s->free_in_buffers, 0, 0);

            if (buffer) {
                av_assert0(new_extradata_size <= buffer->nAllocLen);

                buffer->nFlags |= OMX_BUFFERFLAG_ENDOFFRAME;
                buffer->nFilledLen = new_extradata_size;
                memcpy(buffer->pBuffer + buffer->nOffset, new_extradata, new_extradata_size);

                err = OMX_EmptyThisBuffer(s->handle, buffer);
                if (err != OMX_ErrorNone) {
                    append_buffer(&s->input_mutex, &s->input_cond, &s->num_free_in_buffers, s->free_in_buffers, buffer);
                    av_log(avctx, AV_LOG_ERROR, "OMX_EmptyThisBuffer failed: %x\n", err);
                    return AVERROR_UNKNOWN;
                }
                new_extradata = NULL;
                new_extradata_size = 0;
            }
        }

        /* once the side data has gone out, try to grab a buffer for the
         * actual packet payload (non-blocking) */
        if (!new_extradata) {
            buffer = get_buffer(&s->input_mutex, &s->input_cond,
                    &s->num_free_in_buffers, s->free_in_buffers, 0, 0);
        }

        if (buffer) {
            if (buffer->nAllocLen - buffer->nOffset <= 0) {
                /* unusable buffer: return it to the component and give up.
                 * NOTE(review): nAllocLen/nOffset are unsigned, so this
                 * condition can only trigger on equality (or wraparound). */
                OMX_EmptyThisBuffer(s->handle, buffer);

                av_log(avctx, AV_LOG_ERROR, "no enough buffer\n");
                return AVERROR(EAGAIN);
            }

            /* mark only the first slice of a keyframe packet */
            if ((0 == offset) && (avpkt->flags & AV_PKT_FLAG_KEY)) {
                buffer->nFlags |= OMX_BUFFERFLAG_SYNCFRAME;
            }

            buffer->nFilledLen = MIN(buf_size - offset, buffer->nAllocLen - buffer->nOffset);
            memcpy(buffer->pBuffer + buffer->nOffset, buf + offset, buffer->nFilledLen);

            /* compute the reordered timestamp once per packet; later slices
             * of the same packet reuse it */
            if (0 == offset) {
                tms = reorder_tms(s, avpkt);
            }

            offset += buffer->nFilledLen;

            // Convert the timestamps to microseconds; some encoders can ignore
            // the framerate and do VFR bit allocation based on timestamps.
            //buffer->nTimeStamp = to_omx_ticks(pts);
            buffer->nTimeStamp = to_omx_ticks(tms);
            s->last_timestamp = tms;

            /* last slice of the packet: mark end of frame */
            if (offset == buf_size) {
                s->req_num++;
                buffer->nFlags |= OMX_BUFFERFLAG_ENDOFFRAME;
                av_log(avctx, AV_LOG_TRACE, "req:%ld, pts:0x%lx\n", s->req_num, tms);
            }

            err = OMX_EmptyThisBuffer(s->handle, buffer);
            if (err != OMX_ErrorNone) {
                append_buffer(&s->input_mutex, &s->input_cond, &s->num_free_in_buffers, s->free_in_buffers, buffer);
                av_log(avctx, AV_LOG_ERROR, "OMX_EmptyThisBuffer failed: %x\n", err);
                return AVERROR_UNKNOWN;
            }
        } else if (*got_frame) {
            //drop frame after flush when blocked
            /* no free input buffer but we already produced a frame: recycle
             * one pending output buffer to unblock the pipeline */
            out_buffer = get_buffer(&s->output_mutex, &s->output_cond,
                    &s->num_done_out_buffers, s->done_out_buffers, 0, 0);
            if (out_buffer) {
                av_log(s->avctx, AV_LOG_INFO, "drop frame after flush");

                ftomx_reset_buffer(s, out_buffer);
                err = OMX_FillThisBuffer(s->handle, out_buffer);
                if (err != OMX_ErrorNone) {
                    append_buffer(&s->output_mutex, &s->output_cond, &s->num_done_out_buffers, s->done_out_buffers, out_buffer);
                    av_log(s->avctx, AV_LOG_ERROR, "OMX_FillThisBuffer failed: %x\n", err);
                }
            }
        }

        /* reconfig output port */
        /* deferred until the whole packet is queued; in low-latency mode we
         * additionally wait until the port-settings event arrived */
        if (!s->started && (offset == buf_size) && (s->configed || !s->low_latency)) { //low latency
            wait_port_configed(s);
            ftomx_reconfig_outport(s);
        }

        //handle decoded buffer, then release input buffer to fill cur packet
        if (!*got_frame && s->started) {
            wait = 1;
            /* with B-frames or in low-latency mode, don't block while we
             * still hold an input buffer to send */
            if (buffer && (avctx->has_b_frames || s->low_latency)) {
                wait = 0;
            }

            /* low latency: poll with a short 2ms timeout instead */
            if (!wait && s->low_latency) {
                wait = 1;
                timeout = 2;
            }
retry:
            out_buffer = get_buffer(&s->output_mutex, &s->output_cond,
                    &s->num_done_out_buffers, s->done_out_buffers, wait, timeout);
            if (out_buffer) {
                if (out_buffer->nFlags & OMX_BUFFERFLAG_EOS) {
                    s->got_eos = OMX_TRUE;
                    av_log(avctx, AV_LOG_ERROR, "unexpected EOS flag received\n");
                    ftomx_reset_buffer(s, out_buffer);
                    append_buffer(&s->output_mutex, &s->output_cond, &s->num_done_out_buffers, s->done_out_buffers, out_buffer);
                    goto END;
                } else if (!out_buffer->nFilledLen) {
                    av_log(avctx, AV_LOG_ERROR, "empty outbuf:0x%lx\n", (long int)out_buffer);
                    /* intentional assignment-in-condition: dequeue_tms may
                     * consume the buffer and return NULL; only re-queue a
                     * buffer it handed back.
                     * NOTE(review): err from OMX_FillThisBuffer is ignored
                     * here — presumably a leak on failure; confirm. */
                    if (out_buffer = dequeue_tms(s, out_buffer, pict)) {
                        ftomx_reset_buffer(s, out_buffer);
                        err = OMX_FillThisBuffer(s->handle, out_buffer);
                    }
                    goto retry;
                } else {
                    out_buffer = dequeue_tms(s, out_buffer, pict);
                    if (out_buffer) {
                        *got_frame = ftomx_recv_new_pict(s, pict, out_buffer);
                        if (!*got_frame) {
                            goto END;
                        }
                    } else if (buffer) {
                        /* timestamp not ready yet; keep polling while we
                         * still own an input buffer */
                        goto retry;
                    }
                }
            }
        }
    }

    if (!*got_frame) {
        ret = AVERROR(EAGAIN);
    }

    return ret;

END:
    /* unexpected EOS: reset per-stream state so the next packet restarts
     * extradata delivery */
    //s->started = OMX_FALSE;
    s->eos_sent = OMX_FALSE;
    s->extradata_decoded = OMX_FALSE;

    ret = AVERROR_EOF;
    av_log(avctx, AV_LOG_INFO, "Stream finish\n");

    return ret;
}

/**
 * Drain the decoder at end of stream.
 *
 * Sends an empty input buffer flagged OMX_BUFFERFLAG_EOS once, then blocks
 * on the output queue until either a displayable frame or the EOS-flagged
 * output buffer arrives.
 *
 * Returns 0 with *got_frame set when a buffered frame was drained,
 * AVERROR(EAGAIN) if a non-EOS buffer produced no frame, AVERROR_EOF once
 * the stream is fully drained (or on send failure).
 */
static int ftomx_handle_eof(AVCodecContext *avctx, void *data,
        int *got_frame, AVPacket *avpkt) {
    FTOMXCodecContext *s = avctx->priv_data;
    OMX_BUFFERHEADERTYPE* buffer = NULL;
    OMX_BUFFERHEADERTYPE* out_buffer = NULL;
    OMX_ERRORTYPE err;
    AVFrame *pict      = data;
    int ret = 0;

    /* already drained, or decoding never started: nothing to do */
    if (s->got_eos || !s->started) {
        return AVERROR_EOF;
    }

    /* send eof */
    if (!s->eos_sent) {
        /* blocking (wait=1) fetch of a free input buffer for the EOS marker */
        buffer = get_buffer(&s->input_mutex, &s->input_cond,
                            &s->num_free_in_buffers, s->free_in_buffers, 1, 0);

        buffer->nFilledLen = 0;
        buffer->nFlags = OMX_BUFFERFLAG_EOS;
        buffer->pAppPrivate = s;
        err = OMX_EmptyThisBuffer(s->handle, buffer);
        if (err != OMX_ErrorNone) {
            append_buffer(&s->input_mutex, &s->input_cond, &s->num_free_in_buffers, s->free_in_buffers, buffer);
            av_log(avctx, AV_LOG_ERROR, "OMX_EmptyThisBuffer failed: %x\n", err);
            goto END;
        }
        s->eos_sent = OMX_TRUE;
        av_log(avctx, AV_LOG_INFO, "sent eos\n");
    }

RETRY:
    /* block until the component returns an output buffer */
    out_buffer = get_buffer(&s->output_mutex, &s->output_cond,
        &s->num_done_out_buffers, s->done_out_buffers, OMX_TRUE, 0);

    if (out_buffer->nFlags & OMX_BUFFERFLAG_EOS) {
        av_log(avctx, AV_LOG_INFO, "receive eos\n");
        s->got_eos = OMX_TRUE;
    }

    if (out_buffer->nFilledLen > 0) {
        /* NOTE(review): in ftomx_handle_pkt the result of dequeue_tms is
         * NULL-checked before use; here it is passed straight to
         * ftomx_recv_new_pict — confirm dequeue_tms cannot return NULL on
         * this path, otherwise this dereferences NULL. */
        out_buffer = dequeue_tms(s, out_buffer, pict);
        *got_frame = ftomx_recv_new_pict(s, pict, out_buffer);
        if (!*got_frame) {
            goto END;
        }
    } else if (!s->got_eos) {
        /* empty, non-EOS buffer: recycle it onto the done list and keep
         * draining */
        out_buffer = dequeue_tms(s, out_buffer, pict);
        ftomx_reset_buffer(s, out_buffer);
        append_buffer(&s->output_mutex, &s->output_cond, &s->num_done_out_buffers, s->done_out_buffers, out_buffer);
        goto RETRY;
    }

    if (*got_frame) {
        goto DONE;
    } else if (!s->got_eos) {
        ret = AVERROR(EAGAIN);
        av_log(avctx, AV_LOG_ERROR, "get frame failed\n");
        goto DONE;
    }

END:
    /* stream finished: reset per-stream flags so a new stream can start */
    //s->started = OMX_FALSE;
    s->eos_sent = OMX_FALSE;
    s->extradata_decoded = OMX_FALSE;
    ret = AVERROR_EOF;

    av_log(avctx, AV_LOG_INFO, "Stream finish\n");

DONE:
    return ret;
}

/**
 * AVCodec.decode entry point: route a packet to the normal decode path or,
 * once the packet stream is exhausted (or EOS was already sent), to the
 * drain path.
 */
static int ftomx_decode_frame(AVCodecContext *avctx, void *data,
        int *got_frame, AVPacket *avpkt)
{
    FTOMXCodecContext *s = avctx->priv_data;

    *got_frame = 0;

    /* empty/NULL packet or EOS already queued -> drain */
    if (!avpkt || !avpkt->size || s->eos_sent)
        return ftomx_handle_eof(avctx, data, got_frame, avpkt);

    return ftomx_handle_pkt(avctx, data, got_frame, avpkt);
}

#if 0
/* Disabled draft of a receive_frame-API (AVCodec.receive_frame) entry point;
 * kept for reference — the decoder currently uses the .decode callback via
 * ftomx_decode_frame instead (see the commented-out .receive_frame line in
 * the FTOMXDEC macro). */
static int ftomx_recv_frame(AVCodecContext *avctx, AVFrame *frame)
{
    FTOMXCodecContext *s = avctx->priv_data;
    AVPacket avpkt = {0};
    int got_frame = 0;

    int ret = 0;

    ret = ff_decode_get_packet(avctx, &avpkt);
    if (ret < 0 && ret != AVERROR_EOF)
        return ret;

    if (avpkt.size && !s->eos_sent) {
        return ftomx_handle_pkt(avctx, frame, &got_frame, &avpkt);
    }

    return ftomx_handle_eof(avctx, frame, &got_frame, &avpkt);
}
#endif

/**
 * AVCodec.close callback: release all OMX resources and wipe the private
 * context so a reopened codec starts from a clean slate.
 */
static av_cold int ftomx_decodec_end(AVCodecContext *avctx)
{
    FTOMXCodecContext *ctx = avctx->priv_data;

    cleanup(ctx);
    memset(ctx, 0, sizeof *ctx);

    return 0;
}

#define OFFSET(x) offsetof(FTOMXCodecContext, x)
/* Per-decoder AVOptions exposed to the user (-omx_libname ...) */
static const AVOption options[] = {
    { "omx_libname", "OpenMAX library name", OFFSET(libname), AV_OPT_TYPE_STRING, { 0 }, 0, 0, AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM },
    { NULL }
};

/* Output pixel formats these wrappers can advertise (terminated by NONE) */
static const enum AVPixelFormat ftomx_decoder_pix_fmts[] = {
    AV_PIX_FMT_YUV420P, AV_PIX_FMT_NV12, AV_PIX_FMT_NV16, AV_PIX_FMT_NV20LE, AV_PIX_FMT_NONE
};

/*
 * FTOMXDEC(NAME, LONGNAME, CODEC, bsf_name)
 *
 * Stamps out one AVClass + AVCodec pair per supported codec:
 *  - NAME:     short codec name, used in the class/codec identifiers and the
 *              registered decoder name "<NAME>ftomx"
 *  - LONGNAME: human-readable name for the long_name string
 *  - CODEC:    AV_CODEC_ID_* value
 *  - bsf_name: bitstream filter applied before decoding (e.g.
 *              "h264_mp4toannexb"), or NULL for none
 *
 * All instances share the same options table, pixel-format list and
 * callbacks (ftomx_decode_init / ftomx_decode_frame / ftomx_flush /
 * ftomx_decodec_end). NOTE: comments inside the macro body must stay in
 * block form so they don't swallow the line-continuation backslashes.
 */
#define FTOMXDEC(NAME, LONGNAME, CODEC, bsf_name) \
static const AVClass ftomx_ ## NAME ## _dec_class = {\
    .class_name = #NAME "_ftomx_decoder",\
    .item_name  = av_default_item_name,\
    .option     = options,\
    .version    = LIBAVUTIL_VERSION_INT,\
};\
\
AVCodec ff_ ## NAME ## _ftomx_decoder = { \
    .name           = #NAME "ftomx" ,\
    .long_name      = NULL_IF_CONFIG_SMALL("ft omx " LONGNAME " decoder wrapper"),\
    .type           = AVMEDIA_TYPE_VIDEO,\
    .id             = CODEC ,\
    .priv_data_size = sizeof(FTOMXCodecContext),\
    .priv_class     = &ftomx_ ## NAME ## _dec_class,\
    .pix_fmts       = (enum AVPixelFormat*)&ftomx_decoder_pix_fmts, \
    .init           = ftomx_decode_init,\
    .decode         = ftomx_decode_frame,\
    /*.receive_frame  = ftomx_recv_frame,*/\
    .flush          = ftomx_flush, \
    .close          = ftomx_decodec_end,\
    .bsfs           = bsf_name, \
    .capabilities   = AV_CODEC_CAP_HARDWARE |  AV_CODEC_CAP_AVOID_PROBING| AV_CODEC_CAP_DELAY | AV_CODEC_CAP_DR1, \
    .wrapper_name   = "ftomx", \
};

/* Instantiate one hardware decoder wrapper per supported codec; H.264/HEVC
 * additionally get an MP4->Annex B bitstream filter so the component always
 * sees start-code-delimited NALs. */
FTOMXDEC(h263,  "H.263", AV_CODEC_ID_H263,       NULL);
FTOMXDEC(h264,  "H.264", AV_CODEC_ID_H264,       "h264_mp4toannexb");
FTOMXDEC(hevc,  "HEVC",  AV_CODEC_ID_HEVC,       "hevc_mp4toannexb");
FTOMXDEC(mpeg2, "MPEG2", AV_CODEC_ID_MPEG2VIDEO, NULL);
FTOMXDEC(mpeg4, "MPEG4", AV_CODEC_ID_MPEG4,      NULL);
FTOMXDEC(vc1 ,  "VC1",   AV_CODEC_ID_VC1,        NULL);
FTOMXDEC(vp6,   "VP6",   AV_CODEC_ID_VP6,        NULL);
FTOMXDEC(vp8,   "VP8",   AV_CODEC_ID_VP8,        NULL);
FTOMXDEC(cavs,   "CAVS",   AV_CODEC_ID_CAVS,        NULL);
//FTOMXDEC(jpeg, "JPEG",   AV_CODEC_ID_JPEG2000,        NULL);
FTOMXDEC(mjpeg, "MJPEG",   AV_CODEC_ID_MJPEG,        NULL);
FTOMXDEC(rv30,  "RV30",   AV_CODEC_ID_RV30,        NULL);
FTOMXDEC(rv40,  "RV40",   AV_CODEC_ID_RV40,        NULL);
FTOMXDEC(flv,  "FLV",   AV_CODEC_ID_FLV1,        NULL);